Compare commits

...

41 Commits

Author SHA1 Message Date
Alex Barron
d0da74209b version bump (#1260) 2024-07-11 11:17:55 -07:00
Angelos Katharopoulos
5c1fa64fb0 Custom transforms (#1246) 2024-07-10 18:00:01 -07:00
Alex Barron
a3c287354f Fast Hadamard Transform (#1249)
* Working hadamard for powers of 2

* working for m*2^k

* add scale and check contiguity

* add size check

* clean up

* fix test

* add grads + vmap

* gpu only

* skip on linux

* test typo

* add cpu impl

* remove gpu only tests

* fix linux build + add is_equivalent
2024-07-09 20:39:01 -07:00
Angelos Katharopoulos
03cf033f82 Fix reshape copy bug (#1253) 2024-07-07 21:37:00 -07:00
Alex Barron
bdb36c9a63 add zero vjps for bitwise ops and gather w.r.t. index (#1256) 2024-07-07 21:34:59 -07:00
Awni Hannun
20bb301195 CPU binary reduction + Nits (#1242)
* very minor nits

* reduce binary

* fix test
2024-06-28 13:50:42 -07:00
Awni Hannun
d6383a1c6a version bump (#1239) 2024-06-27 10:43:13 -07:00
Angelos Katharopoulos
b05bcfd27f Fixes segfault when compiling checkpointed functions (#1235) 2024-06-26 16:14:45 -07:00
Alex Barron
2615660e62 Fix strided sort bug (#1236)
* Use output strides in sort kernel

* fix zero strides bug
2024-06-26 14:32:11 -07:00
Awni Hannun
5b0af4cdb1 fix donation condition for compilation (#1237) 2024-06-26 09:04:05 -07:00
Jagrit Digani
8c2e15e6c8 Accelerate import updates for iOS (#1227)
* Update veclib and bnns includes to #include <Accelerate/Accelerate.h> for compatibility with ios

* Mark float literals in softmax.cpp to be float16_t for errors in ios

* Add arm neon vector operation guards

* Redirect to common backend for consistency
2024-06-26 09:01:50 -07:00
Awni Hannun
56c8a33439 Get metal version from xcode (#1228)
* get metal version from xcode

* typo

* fix
2024-06-26 07:02:11 -07:00
David Koski
4eef1e8a3e fix typo (#1215) 2024-06-24 13:36:35 -07:00
Alex Barron
95d11bda06 Fix NumPy 2.0 pickle test (#1221)
* fix numpy version <2 temporarily

* typo

* better fix

* Fix just for bfloat16

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-23 05:47:22 -07:00
Awni Hannun
af9079cc1f version bump (#1212) 2024-06-14 11:28:51 -07:00
Jagrit Digani
2d6cd47713 Masked gemv (#1211) 2024-06-14 09:52:26 -07:00
Awni Hannun
fe3167d7ea smaller CPU binary (#1203)
* smaller CPU binary

* fix no cpu build
2024-06-14 09:46:55 -07:00
Awni Hannun
31e134be35 Build for macOS 15 (#1208)
* Build for macos 15

* metal32 as well

* comment

---------

Co-authored-by: Awni Hannun <Awni Hannun>
2024-06-13 13:31:44 -07:00
Awni Hannun
e84ba8056d only allow openmpi (#1209) 2024-06-13 12:14:44 -07:00
Fangjun Kuang
f20e97b092 minor fixes (#1194)
* minor fixes

* fix build errors
2024-06-12 22:06:49 -07:00
Alex Barron
934683088e Refactor JIT for unary/binary/ternary ops (#1206)
* refactor unary/binary/ternary ops

* get_primitive_string util

---------
2024-06-12 14:22:12 -07:00
Awni Hannun
de2b9e7d0a Fix kernel deps to reduce build times (#1205) 2024-06-12 11:17:39 -07:00
Alex Barron
dd7d8e5e29 Add Quantized Ops to the JIT (#1204)
* JIT for quantized ops

* remove unused imports

* address comments

* fix imports

* second attempt to fix imports

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-12 09:47:12 -07:00
Awni Hannun
df964132fb fix scatter + test (#1202)
* fix scatter + test

* fix test warnings

* fix metal validation
2024-06-11 14:35:12 -07:00
Awni Hannun
709ccc6800 install mpi for release build (#1199) 2024-06-10 10:09:32 -07:00
Awni Hannun
cf236fc390 version (#1191) 2024-06-06 17:16:40 -07:00
Alex Barron
27d70c7d9d Feature complete Metal FFT (#1102)
* feature complete metal fft

* fix contiguity bug

* jit fft

* simplify rader/bluestein constant computation

* remove kernel/utils.h dep

* remove bf16.h dep

* format

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-06 12:57:25 -07:00
nicolov
0e585b4409 Add docstring for scatter (#1189)
* Add docstring for scatter

* docs nits

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-06-06 11:51:25 -07:00
Angelos Katharopoulos
0163a8e57a Add docs for the distributed namespace (#1184) 2024-06-06 11:37:00 -07:00
Awni Hannun
578842954c fix jit scan when output doesn't have primitive (#1190) 2024-06-06 07:24:58 -07:00
Awni Hannun
496315fe1d Fix scan (#1188)
* fix scan

* improve grid size

* fix cpu cummax
2024-06-05 14:21:58 -07:00
Angelos Katharopoulos
0fe6895893 Fix the hard-shrink test (#1185) 2024-06-04 16:22:56 -07:00
Nikhil Mehta
0b7d71fd2f Add softmin, hardshrink, hardtanh (#1180)
---------

Co-authored-by: Nikhil Mehta <nikmehta@tesla.com>
2024-06-04 15:48:18 -07:00
Awni Hannun
83b11bc58d Fix Metal API validation for empty concat (#1183) 2024-06-04 13:17:08 -07:00
Alex Barron
375a8bbdcc Add some internal GPU apis (#1177)
* Add unary/binary/ternay/slice/concat internal GPU ops

* add pad internal op

* formatting + no_cpu fix
2024-06-04 09:24:26 -07:00
Awni Hannun
ea9090bbc4 Add view op (#1179)
* add view primitive

* nit

* fix view
2024-06-04 08:05:27 -07:00
nicolov
81def6ac76 Fix benchmark (#1175) 2024-06-04 07:50:46 -07:00
Angelos Katharopoulos
3de8ce3f3c In place all-reduce and forgiving init (#1178) 2024-06-03 16:47:47 -07:00
Alex Barron
4d485fca24 Add defines include (#1176)
Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-03 09:50:10 -07:00
Brian Keene
1865299a30 Metal shaders for memory efficient self attention on large sequences (#964)
* Metal shaders for efficient self attention on large sequences

Updated fast attention: GEMM-ified with Steel primitives
Uses flash attention 1 for scale correction

* more compiler silencing

* Address rebase issues

* Templatize kernel instantiation, revise cpu bindings

* Safer writes to output

* Permit batch size > 1

* Numerical fixes for sdpa self attention

* Re-enable test, remove unused variable

* add benchmarking script

* Disable sdpa prior to perf tuning, and simplify tests for per-patch CI
2024-06-03 09:16:19 -07:00
Dominik Schlösser
3576b547c5 Doc error for default for scale in SinusoidalPositionalEncoding (#1174) 2024-06-02 13:42:45 -07:00
154 changed files with 11359 additions and 4797 deletions

View File

@@ -144,6 +144,7 @@ jobs:
name: Install dependencies
command: |
brew install python@<< parameters.python_version >>
brew install openmpi
python<< parameters.python_version >> -m venv env
source env/bin/activate
pip install --upgrade pip

View File

@@ -17,4 +17,4 @@ jobs:
pip install pre-commit black isort clang-format
- name: Run lint
run: |
pre-commit run --all-files
pre-commit run --all-files

View File

@@ -24,7 +24,7 @@ option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
if(NOT MLX_VERSION)
set(MLX_VERSION 0.14.1)
set(MLX_VERSION 0.16.0)
endif()
# --------------------- Processor tests -------------------------
@@ -83,24 +83,21 @@ elseif (MLX_BUILD_METAL)
OUTPUT_VARIABLE MACOS_VERSION
COMMAND_ERROR_IS_FATAL ANY)
message(STATUS "Building with SDK for macOS version ${MACOS_VERSION}")
if (${MACOS_VERSION} GREATER_EQUAL 14.2)
set(METAL_CPP_PATCH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/metal.14.2.diff)
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14.2_iOS17.2.zip)
set(MLX_METAL_VERSION METAL_3_1)
elseif (${MACOS_VERSION} GREATER_EQUAL 14.0)
set(METAL_CPP_PATCH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/metal.14.0.diff)
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14_iOS17-beta.zip)
set(MLX_METAL_VERSION METAL_3_0)
else()
if (${MACOS_VERSION} LESS 14.0)
message(FATAL_ERROR "MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON" )
endif()
message(STATUS "Building with SDK for macOS version ${MACOS_VERSION}")
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18-beta.zip)
# Get the metal version
execute_process(
COMMAND zsh "-c" "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal -E -x metal -P - | tail -1 | tr -d '\n'"
OUTPUT_VARIABLE MLX_METAL_VERSION
COMMAND_ERROR_IS_FATAL ANY)
FetchContent_Declare(
metal_cpp
URL ${METAL_CPP_URL}
PATCH_COMMAND /usr/bin/patch -N -i ${METAL_CPP_PATCH} || true
)
FetchContent_MakeAvailable(metal_cpp)
@@ -115,7 +112,7 @@ elseif (MLX_BUILD_METAL)
${FOUNDATION_LIB}
${QUARTZ_LIB})
add_compile_definitions(${MLX_METAL_VERSION})
add_compile_definitions("MLX_METAL_VERSION=${MLX_METAL_VERSION}")
endif()
if (MLX_BUILD_CPU)
@@ -169,7 +166,19 @@ endif()
find_package(MPI)
if (MPI_FOUND)
execute_process(
COMMAND zsh "-c" "mpirun --version"
OUTPUT_VARIABLE MPI_VERSION
COMMAND_ERROR_IS_FATAL ANY
)
if (${MPI_VERSION} MATCHES ".*Open MPI.*")
target_include_directories(mlx PRIVATE ${MPI_INCLUDE_PATH})
else()
message(
WARNING
"MPI which is not OpenMPI found. Building without MPI."
)
endif()
endif()
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)

View File

@@ -185,7 +185,7 @@ def prelu(x: torch.Tensor) -> torch.Tensor:
def mish(x: torch.Tensor) -> torch.Tensor:
y = x
for _ in range(100):
return torch.nn.functional.mish(y)
y = torch.nn.functional.mish(y)
sync_if_needed(x)
@@ -283,6 +283,14 @@ def topk(axis, x):
sync_if_needed(x)
@torch.no_grad()
def step_function(x):
y = x
for i in range(100):
y = torch.where(y < 0, 0, 1)
sync_if_needed(x)
@torch.no_grad()
def selu(x):
y = x
@@ -446,5 +454,11 @@ if __name__ == "__main__":
elif args.benchmark == "topk":
print(bench(topk, axis, x))
elif args.benchmark == "step":
print(bench(step_function, x))
elif args.benchmark == "selu":
print(bench(selu, x))
else:
raise ValueError("Unknown benchmark")
raise ValueError(f"Unknown benchmark `{args.benchmark}`.")

View File

@@ -16,7 +16,9 @@ def run_or_raise(*args, **kwargs):
result = run(*args, capture_output=True, **kwargs)
return float(result.stdout)
except ValueError:
raise ValueError(f"stdout: {result.stdout}\nstderr: {result.stderr}")
raise ValueError(
f"stdout: {result.stdout.decode()}\nstderr: {result.stderr.decode()}"
)
def compare(args):

View File

@@ -9,7 +9,6 @@ from time_utils import time_fn
def bench_gelu():
def gelu(x):
return x * (1 + mx.erf(x / math.sqrt(2))) / 2
@@ -51,7 +50,6 @@ def bench_gelu():
def bench_layernorm():
weight = mx.random.uniform(shape=(4096,)).astype(mx.float16)
bias = mx.random.uniform(shape=(4096,)).astype(mx.float16)
mx.eval(weight, bias)

View File

@@ -54,7 +54,6 @@ def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
scale = 1.0 / math.sqrt(kH * kH * C)
a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(

View File

@@ -3,6 +3,8 @@
import matplotlib
import mlx.core as mx
import numpy as np
import sympy
import torch
from time_utils import measure_runtime
matplotlib.use("Agg")
@@ -16,41 +18,100 @@ def bandwidth_gb(runtime_ms, system_size):
return system_size * bytes_per_fft / runtime_ms * ms_per_s / bytes_per_gb
def run_bench(system_size):
def fft(x):
out = mx.fft.fft(x)
def run_bench(system_size, fft_sizes, backend="mlx", dim=1):
def fft_mlx(x):
if dim == 1:
out = mx.fft.fft(x)
elif dim == 2:
out = mx.fft.fft2(x)
mx.eval(out)
return out
bandwidths = []
for k in range(4, 12):
n = 2**k
x = mx.random.uniform(shape=(system_size // n, n)).astype(mx.float32)
x = x.astype(mx.complex64)
mx.eval(x)
runtime_ms = measure_runtime(fft, x=x)
bandwidths.append(bandwidth_gb(runtime_ms, system_size))
def fft_mps(x):
if dim == 1:
out = torch.fft.fft(x)
elif dim == 2:
out = torch.fft.fft2(x)
torch.mps.synchronize()
return out
return bandwidths
bandwidths = []
for n in fft_sizes:
batch_size = system_size // n**dim
shape = [batch_size] + [n for _ in range(dim)]
if backend == "mlx":
x_np = np.random.uniform(size=(system_size // n, n)).astype(np.complex64)
x = mx.array(x_np)
mx.eval(x)
fft = fft_mlx
elif backend == "mps":
x_np = np.random.uniform(size=(system_size // n, n)).astype(np.complex64)
x = torch.tensor(x_np, device="mps")
torch.mps.synchronize()
fft = fft_mps
else:
raise NotImplementedError()
runtime_ms = measure_runtime(fft, x=x)
bandwidth = bandwidth_gb(runtime_ms, np.prod(shape))
print(n, bandwidth)
bandwidths.append(bandwidth)
return np.array(bandwidths)
def time_fft():
x = np.array(range(2, 512))
system_size = int(2**26)
with mx.stream(mx.cpu):
cpu_bandwidths = run_bench(system_size=int(2**22))
print("MLX GPU")
with mx.stream(mx.gpu):
gpu_bandwidths = run_bench(system_size=int(2**29))
gpu_bandwidths = run_bench(system_size=system_size, fft_sizes=x)
# plot bandwidths
x = [2**k for k in range(4, 12)]
plt.scatter(x, gpu_bandwidths, color="green", label="GPU")
plt.scatter(x, cpu_bandwidths, color="red", label="CPU")
plt.title("MLX FFT Benchmark")
plt.xlabel("N")
plt.ylabel("Bandwidth (GB/s)")
plt.legend()
plt.savefig("fft_plot.png")
print("MPS GPU")
mps_bandwidths = run_bench(system_size=system_size, fft_sizes=x, backend="mps")
print("CPU")
system_size = int(2**20)
with mx.stream(mx.cpu):
cpu_bandwidths = run_bench(system_size=system_size, fft_sizes=x)
x = np.array(x)
all_indices = x - x[0]
radix_2to13 = (
np.array([i for i in x if all(p <= 13 for p in sympy.primefactors(i))]) - x[0]
)
bluesteins = (
np.array([i for i in x if any(p > 13 for p in sympy.primefactors(i))]) - x[0]
)
for indices, name in [
(all_indices, "All"),
(radix_2to13, "Radix 2-13"),
(bluesteins, "Bluestein's"),
]:
# plot bandwidths
print(name)
plt.scatter(x[indices], gpu_bandwidths[indices], color="green", label="GPU")
plt.scatter(x[indices], mps_bandwidths[indices], color="blue", label="MPS")
plt.scatter(x[indices], cpu_bandwidths[indices], color="red", label="CPU")
plt.title(f"MLX FFT Benchmark -- {name}")
plt.xlabel("N")
plt.ylabel("Bandwidth (GB/s)")
plt.legend()
plt.savefig(f"{name}.png")
plt.clf()
av_gpu_bandwidth = np.mean(gpu_bandwidths)
av_mps_bandwidth = np.mean(mps_bandwidths)
av_cpu_bandwidth = np.mean(cpu_bandwidths)
print("Average bandwidths:")
print("GPU:", av_gpu_bandwidth)
print("MPS:", av_mps_bandwidth)
print("CPU:", av_cpu_bandwidth)
portion_faster = len(np.where(gpu_bandwidths > mps_bandwidths)[0]) / len(x)
print("Percent MLX faster than MPS: ", portion_faster * 100)
if __name__ == "__main__":

View File

@@ -0,0 +1,70 @@
import argparse
import matplotlib
import mlx.core as mx
import numpy as np
from time_utils import measure_runtime
matplotlib.use("Agg")
import matplotlib.pyplot as plt
def had(x):
y = mx.hadamard_transform(x)
mx.eval(y)
def copy(x):
y = x + 1.0
mx.eval(y)
def run(dtype):
system_size = 2**26
outputs = {}
for test_fn in (had, copy):
for m in [1, 12, 20, 28]:
if test_fn == copy:
key = "copy"
elif m == 1:
key = "had_2^k"
else:
key = "had_m*2^k"
outputs.setdefault(key, {})
for k in range(7, 14):
n = m * 2**k
if n > 2**15:
continue
x_np = np.random.normal(size=(system_size // n, n)).astype(dtype)
x = mx.array(x_np)
runtime_ms = measure_runtime(test_fn, x=x)
bytes_per_gb = 1e9
ms_per_s = 1e3
bytes_per_had = np.dtype(x_np.dtype).itemsize * 2
bandwidth_gb = (
system_size * bytes_per_had / runtime_ms * ms_per_s / bytes_per_gb
)
print(n, bandwidth_gb)
outputs[key][n] = bandwidth_gb
colors = {
"copy": "black",
"had_2^k": "steelblue",
"had_m*2^k": "skyblue",
}
for key, output in outputs.items():
plt.scatter(output.keys(), output.values(), color=colors[key], label=key)
plt.title(f"MLX Hadamard Benchmark -- {dtype.__name__}")
plt.xlabel("N")
plt.ylabel("Bandwidth (GB/s)")
plt.legend()
plt.savefig(f"bench_{dtype.__name__}.png")
plt.clf()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--fp16", action="store_true")
args = parser.parse_args()
dtype = np.float16 if args.fp16 else np.float32
run(dtype)

View File

@@ -0,0 +1,62 @@
import argparse
import math
import mlx.core as mx
from time_utils import time_fn
MAX_SEQ = 300
START_SEQ = 100
SEQ_INCREMENT = 50
def time_self_attention_primitives():
mx.random.seed(3)
B = 2
H = 38
D = 64
for R in range(START_SEQ, MAX_SEQ, SEQ_INCREMENT):
q = mx.random.uniform(shape=(B, H, R, D))
k = mx.random.uniform(shape=(B, H, R, D))
v = mx.random.uniform(shape=(B, H, R, D))
scale = 1.0 / math.sqrt(float(D))
mx.eval(q, k, v)
def sdpa_primitives(qs, ks, vs, alpha):
s = (alpha * qs) @ ks.transpose(0, 1, 3, 2)
p = mx.softmax(s.astype(mx.float32), axis=-1).astype(s.dtype)
o = p @ vs
return o
time_fn(sdpa_primitives, q, k, v, scale)
def time_self_attention_sdpa():
mx.random.seed(3)
B = 2
H = 38
D = 64
for R in range(START_SEQ, MAX_SEQ, SEQ_INCREMENT):
q = mx.random.uniform(shape=(B, H, R, D))
k = mx.random.uniform(shape=(B, H, R, D))
v = mx.random.uniform(shape=(B, H, R, D))
scale = 1.0 / math.sqrt(float(D))
mx.eval(q, k, v)
def sdpa_fused(qs, ks, vs, alpha):
o = mx.fast.scaled_dot_product_attention(qs, ks, vs, scale=alpha)
return o
time_fn(sdpa_fused, q, k, v, scale)
if __name__ == "__main__":
parser = argparse.ArgumentParser("MLX benchmarks.")
parser.add_argument("--gpu", action="store_true", help="Use the Metal back-end.")
args = parser.parse_args()
if args.gpu:
mx.set_default_device(mx.gpu)
else:
mx.set_default_device(mx.cpu)
time_self_attention_sdpa()
time_self_attention_primitives()

View File

@@ -1,36 +0,0 @@
diff -ur Metal/MTLEvent.hpp MetalNew/MTLEvent.hpp
--- Metal/MTLEvent.hpp 2023-06-01 12:18:26
+++ MetalNew/MTLEvent.hpp 2024-04-15 07:36:59
@@ -62,6 +62,7 @@
uint64_t signaledValue() const;
void setSignaledValue(uint64_t signaledValue);
+ bool waitUntilSignaledValue(uint64_t signaledValue, uint64_t timeoutMS);
};
class SharedEventHandle : public NS::SecureCoding<SharedEventHandle>
@@ -138,6 +139,11 @@
_MTL_INLINE void MTL::SharedEvent::setSignaledValue(uint64_t signaledValue)
{
Object::sendMessage<void>(this, _MTL_PRIVATE_SEL(setSignaledValue_), signaledValue);
+}
+
+// method: waitUntilSignaledValue
+_MTL_INLINE bool MTL::SharedEvent::waitUntilSignaledValue(uint64_t signaledValue, uint64_t timeoutMS) {
+ return Object::sendMessage<bool>(this, _MTL_PRIVATE_SEL(waitUntilSignaledValue_timeoutMS_), signaledValue, timeoutMS);
}
// static method: alloc
diff -ur Metal/MTLHeaderBridge.hpp MetalNew/MTLHeaderBridge.hpp
--- Metal/MTLHeaderBridge.hpp 2023-06-01 12:18:26
+++ MetalNew/MTLHeaderBridge.hpp 2024-04-15 07:37:29
@@ -1906,6 +1906,9 @@
"setShouldMaximizeConcurrentCompilation:");
_MTL_PRIVATE_DEF_SEL(setSignaledValue_,
"setSignaledValue:");
+_MTL_PRIVATE_DEF_SEL(
+ waitUntilSignaledValue_timeoutMS_,
+ "waitUntilSignaledValue:timeoutMS:");
_MTL_PRIVATE_DEF_SEL(setSize_,
"setSize:");
_MTL_PRIVATE_DEF_SEL(setSlice_,

View File

@@ -1,36 +0,0 @@
diff -ur Metal/MTLEvent.hpp MetalNew/MTLEvent.hpp
--- Metal/MTLEvent.hpp 2024-04-15 07:12:10
+++ MetalNew/MTLEvent.hpp 2024-04-15 07:15:50
@@ -62,6 +62,7 @@
uint64_t signaledValue() const;
void setSignaledValue(uint64_t signaledValue);
+ bool waitUntilSignaledValue(uint64_t signaledValue, uint64_t timeoutMS);
};
class SharedEventHandle : public NS::SecureCoding<SharedEventHandle>
@@ -138,6 +139,11 @@
_MTL_INLINE void MTL::SharedEvent::setSignaledValue(uint64_t signaledValue)
{
Object::sendMessage<void>(this, _MTL_PRIVATE_SEL(setSignaledValue_), signaledValue);
+}
+
+// method: waitUntilSignaledValue
+_MTL_INLINE bool MTL::SharedEvent::waitUntilSignaledValue(uint64_t signaledValue, uint64_t timeoutMS) {
+ return Object::sendMessage<bool>(this, _MTL_PRIVATE_SEL(waitUntilSignaledValue_timeoutMS_), signaledValue, timeoutMS);
}
// static method: alloc
diff -ur Metal/MTLHeaderBridge.hpp MetalNew/MTLHeaderBridge.hpp
--- Metal/MTLHeaderBridge.hpp 2024-04-15 07:12:10
+++ MetalNew/MTLHeaderBridge.hpp 2024-04-15 07:16:15
@@ -1918,6 +1918,9 @@
"setShouldMaximizeConcurrentCompilation:");
_MTL_PRIVATE_DEF_SEL(setSignaledValue_,
"setSignaledValue:");
+_MTL_PRIVATE_DEF_SEL(
+ waitUntilSignaledValue_timeoutMS_,
+ "waitUntilSignaledValue:timeoutMS:");
_MTL_PRIVATE_DEF_SEL(setSize_,
"setSize:");
_MTL_PRIVATE_DEF_SEL(setSlice_,

View File

@@ -43,6 +43,7 @@ are the CPU and GPU.
usage/function_transforms
usage/compile
usage/numpy
usage/distributed
usage/using_streams
.. toctree::
@@ -69,6 +70,7 @@ are the CPU and GPU.
python/metal
python/nn
python/optimizers
python/distributed
python/tree_utils
.. toctree::

View File

@@ -195,7 +195,7 @@ GGUF, you can do:
.. code-block:: shell
cmake ..
cmake .. \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DBUILD_SHARED_LIBS=ON \
-DMLX_BUILD_CPU=OFF \

View File

@@ -0,0 +1,19 @@
.. _distributed:
.. currentmodule:: mlx.core.distributed
Distributed Communication
==========================
MLX provides a distributed communication package using MPI. The MPI library is
loaded at runtime; if MPI is available then distributed communication is also
made available.
.. autosummary::
:toctree: _autosummary
Group
is_available
init
all_sum
all_gather

View File

@@ -17,6 +17,8 @@ simple functions.
gelu_approx
gelu_fast_approx
glu
hard_shrink
hard_tanh
hardswish
leaky_relu
log_sigmoid
@@ -29,6 +31,7 @@ simple functions.
sigmoid
silu
softmax
softmin
softplus
softshrink
step

View File

@@ -21,10 +21,15 @@ Layers
Dropout3d
Embedding
GELU
GLU
GroupNorm
GRU
HardShrink
HardTanh
Hardswish
InstanceNorm
LayerNorm
LeakyReLU
Linear
LSTM
MaxPool1d
@@ -36,13 +41,19 @@ Layers
QuantizedLinear
RMSNorm
ReLU
ReLU6
RNN
RoPE
SELU
Sequential
SiLU
SinusoidalPositionalEncoding
Softmin
Softshrink
Softsign
Softmax
Softplus
Step
Tanh
Transformer
Upsample

View File

@@ -72,6 +72,7 @@ Operations
gather_qmm
greater
greater_equal
hadamard_transform
identity
inner
isclose
@@ -156,6 +157,7 @@ Operations
tril
triu
var
view
where
zeros
zeros_like

View File

@@ -10,6 +10,7 @@ Transforms
eval
compile
custom_function
disable_compile
enable_compile
grad

View File

@@ -0,0 +1,166 @@
.. _usage_distributed:
Distributed Communication
=========================
.. currentmodule:: mlx.core.distributed
MLX utilizes `MPI <https://en.wikipedia.org/wiki/Message_Passing_Interface>`_ to
provide distributed communication operations that allow the computational cost
of training or inference to be shared across many physical machines. You can
see a list of the supported operations in the :ref:`API docs<distributed>`.
.. note::
Many operations are not yet supported or are not as fast as they should be. We
are adding more, and tuning the ones we have, as we figure out the best way to
do distributed computing on Macs using MLX.
Getting Started
---------------
MLX already comes with the ability to "talk" to MPI if it is installed on the
machine. The minimal distributed program in MLX is as simple as:
.. code:: python
import mlx.core as mx
world = mx.distributed.init()
x = mx.distributed.all_sum(mx.ones(10))
print(world.rank(), x)
The program above sums the array ``mx.ones(10)`` across all
distributed processes. If simply run with ``python``, however, only one
process is launched and no distributed communication takes place.
To launch the program in distributed mode we need to use ``mpirun`` or
``mpiexec`` depending on the MPI installation. The simplest possible way is the
following:
.. code:: shell
$ mpirun -np 2 python test.py
1 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)
0 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)
The above launches two processes on the same (local) machine and we can see
both standard output streams. The processes send the array of ones to each other
and compute the sum, which is printed. Launching with ``mpirun -np 4 ...`` would
print arrays of 4s, and so on.
Installing MPI
---------------
MPI can be installed with Homebrew, using the Anaconda package manager or
compiled from source. Most of our testing is done using ``openmpi`` installed
with the Anaconda package manager as follows:
.. code:: shell
$ conda install openmpi
Installing with Homebrew may require specifying the location of ``libmpi.dylib``
so that MLX can find and load it at runtime. This can be done by passing the
``DYLD_LIBRARY_PATH`` environment variable to ``mpirun``.
.. code:: shell
$ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python test.py
Setting up Remote Hosts
-----------------------
MPI can automatically connect to remote hosts and set up the communication over
the network if the remote hosts can be accessed via ssh. A good checklist to
debug connectivity issues is the following:
* ``ssh hostname`` works from all machines to all machines without asking for
password or host confirmation
* ``mpirun`` is accessible on all machines. You can call ``mpirun`` using its
full path to force all machines to use a specific path.
* Ensure that the ``hostname`` used by MPI is the one that you have configured
in the ``.ssh/config`` files on all machines.
.. note::
For example, for the hostname ``foo.bar.com`` MPI may use only ``foo`` as
the hostname passed to ssh if the current hostname matches ``*.bar.com``.
An easy way to pass the host names to MPI is using a host file. A host file
looks like the following, where ``host1`` and ``host2`` should be the fully
qualified domain names or IPs for these hosts.
.. code::
host1 slots=1
host2 slots=1
When using MLX, it is very likely that you want to use 1 slot per host, i.e. one
process per host. The hostfile also needs to contain the current host if you
want to run on the local host. Pass the host file to ``mpirun`` with the
``--hostfile`` command line argument.
Training Example
----------------
In this section we will adapt an MLX training loop to support data parallel
distributed training. Namely, we will average the gradients across a set of
hosts before applying them to the model.
Our training loop looks like the following code snippet if we omit the model,
dataset and optimizer initialization.
.. code:: python
model = ...
optimizer = ...
dataset = ...
def step(model, x, y):
    loss, grads = loss_grad_fn(model, x, y)
    optimizer.update(model, grads)
    return loss

for x, y in dataset:
    loss = step(model, x, y)
    mx.eval(loss, model.parameters())
All we have to do to average the gradients across machines is perform an
:func:`all_sum` and divide by the size of the :class:`Group`. Namely, we
have to :func:`mlx.utils.tree_map` the gradients with the following function.
.. code:: python
def all_avg(x):
    return mx.distributed.all_sum(x) / mx.distributed.init().size()
Putting everything together, our training loop step looks as follows, with
everything else remaining the same.
.. code:: python
from mlx.utils import tree_map
def all_reduce_grads(grads):
    N = mx.distributed.init().size()
    if N == 1:
        return grads
    return tree_map(
        lambda x: mx.distributed.all_sum(x) / N,
        grads)

def step(model, x, y):
    loss, grads = loss_grad_fn(model, x, y)
    grads = all_reduce_grads(grads)  # <--- This line was added
    optimizer.update(model, grads)
    return loss
Tuning All Reduce
-----------------
We are working on improving the performance of all reduce in MLX, but for now
the two main things one can do to get the most out of distributed training with MLX are:
1. Perform a few large reductions instead of many small ones to improve
bandwidth and latency (see the sketch after this list)
2. Pass ``--mca btl_tcp_links 4`` to ``mpirun`` to configure it to use 4 tcp
connections between each host to improve bandwidth
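A minimal sketch of the first tip, assuming all gradients share a dtype (which
``concatenate`` requires) and using the ``mlx.utils`` tree helpers;
``all_avg_packed`` is a hypothetical helper name, not part of the MLX API:
.. code:: python
import mlx.core as mx
from mlx.utils import tree_flatten, tree_unflatten

def all_avg_packed(grads):
    # Pack the whole gradient tree into one buffer so a single all_sum
    # replaces one reduction per parameter.
    group = mx.distributed.init()
    if group.size() == 1:
        return grads

    flat = tree_flatten(grads)              # list of (key, array) pairs
    shapes = [g.shape for _, g in flat]
    sizes = [g.size for _, g in flat]

    packed = mx.concatenate([g.reshape(-1) for _, g in flat])
    packed = mx.distributed.all_sum(packed) / group.size()

    # Split the reduced buffer back into per-parameter arrays.
    offsets, total = [], 0
    for s in sizes[:-1]:
        total += s
        offsets.append(total)
    pieces = mx.split(packed, offsets)
    return tree_unflatten(
        [(k, p.reshape(s)) for (k, _), p, s in zip(flat, pieces, shapes)]
    )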

View File

@@ -16,7 +16,7 @@ int main() {
std::cout << global_group.rank() << " / " << global_group.size() << std::endl;
array x = ones({10});
array out = distributed::all_reduce_sum(x, global_group);
array out = distributed::all_sum(x, global_group);
std::cout << out << std::endl;
}

View File

@@ -17,6 +17,10 @@ bool in_tracing() {
return detail::InTracing::in_tracing();
}
bool retain_graph() {
return detail::RetainGraph::retain_graph();
}
} // namespace
array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
@@ -102,7 +106,7 @@ void array::eval() {
}
bool array::is_tracer() const {
return array_desc_->is_tracer && in_tracing();
return array_desc_->is_tracer && in_tracing() || retain_graph();
}
void array::set_data(allocator::Buffer buffer, deleter_t d) {
@@ -206,7 +210,7 @@ void array::ArrayDesc::init() {
strides[i] = size;
size *= shape[i];
}
for (auto& in : inputs) {
for (const auto& in : inputs) {
is_tracer |= in.is_tracer();
}
}
@@ -231,7 +235,7 @@ array::ArrayDesc::ArrayDesc(
array::ArrayDesc::~ArrayDesc() {
// When an array description is destroyed it will delete a bunch of arrays
// that may also destory their corresponding descriptions and so on and so
// that may also destroy their corresponding descriptions and so on and so
// forth.
//
// This calls recursively the destructor and can result in stack overflow, we

View File

@@ -73,32 +73,32 @@ class array {
this->array_desc_ = other.array_desc_;
}
return *this;
};
}
/** The size of the array's datatype in bytes. */
size_t itemsize() const {
return size_of(dtype());
};
}
/** The number of elements in the array. */
size_t size() const {
return array_desc_->size;
};
}
/** The number of bytes in the array. */
size_t nbytes() const {
return size() * itemsize();
};
}
/** The number of dimensions of the array. */
size_t ndim() const {
return array_desc_->shape.size();
};
}
/** The shape of the array as a vector of integers. */
const std::vector<int>& shape() const {
return array_desc_->shape;
};
}
/**
* Get the size of the corresponding dimension.
@@ -107,12 +107,12 @@ class array {
* bounds checking. */
int shape(int dim) const {
return shape().at(dim < 0 ? dim + ndim() : dim);
};
}
/** The strides of the array. */
const std::vector<size_t>& strides() const {
return array_desc_->strides;
};
}
/**
* Get the stride of the corresponding dimension.
@@ -121,12 +121,12 @@ class array {
* bounds checking. */
size_t strides(int dim) const {
return strides().at(dim < 0 ? dim + ndim() : dim);
};
}
/** Get the arrays data type. */
Dtype dtype() const {
return array_desc_->dtype;
};
}
/** Evaluate the array. */
void eval();
@@ -160,10 +160,10 @@ class array {
friend bool operator==(const ArrayIterator& a, const ArrayIterator& b) {
return a.arr.id() == b.arr.id() && a.idx == b.idx;
};
}
friend bool operator!=(const ArrayIterator& a, const ArrayIterator& b) {
return !(a == b);
};
}
private:
const array& arr;
@@ -209,7 +209,7 @@ class array {
allocator::Buffer buffer;
deleter_t d;
Data(allocator::Buffer buffer, deleter_t d = allocator::free)
: buffer(buffer), d(d) {};
: buffer(buffer), d(d) {}
// Not copyable
Data(const Data& d) = delete;
Data& operator=(const Data& d) = delete;
@@ -230,22 +230,22 @@ class array {
/** The array's primitive. */
Primitive& primitive() const {
return *(array_desc_->primitive);
};
}
/** A shared pointer to the array's primitive. */
std::shared_ptr<Primitive>& primitive_ptr() const {
return array_desc_->primitive;
};
}
/** Check if the array has an attached primitive or is a leaf node. */
bool has_primitive() const {
return array_desc_->primitive != nullptr;
};
}
/** The array's inputs. */
const std::vector<array>& inputs() const {
return array_desc_->inputs;
};
}
std::vector<array>& inputs() {
return array_desc_->inputs;
@@ -259,12 +259,12 @@ class array {
/** The array's siblings. */
const std::vector<array>& siblings() const {
return array_desc_->siblings;
};
}
/** The array's siblings. */
std::vector<array>& siblings() {
return array_desc_->siblings;
};
}
void set_siblings(std::vector<array> siblings, uint16_t position) {
array_desc_->siblings = std::move(siblings);
@@ -281,7 +281,7 @@ class array {
outputs.push_back(*this);
outputs.insert(outputs.end(), siblings().begin() + idx, siblings().end());
return outputs;
};
}
/** Detach the array from the graph. */
void detach();
@@ -289,19 +289,19 @@ class array {
/** Get the Flags bit-field. */
const Flags& flags() const {
return array_desc_->flags;
};
}
/** The size (in elements) of the underlying buffer the array points to. */
size_t data_size() const {
return array_desc_->data_size;
};
}
allocator::Buffer& buffer() {
return array_desc_->data->buffer;
};
}
const allocator::Buffer& buffer() const {
return array_desc_->data->buffer;
};
}
// Return a copy of the shared pointer
// to the array::Data struct
@@ -312,19 +312,20 @@ class array {
template <typename T>
T* data() {
return static_cast<T*>(array_desc_->data_ptr);
};
}
template <typename T>
const T* data() const {
return static_cast<T*>(array_desc_->data_ptr);
};
}
enum Status { unscheduled, scheduled, available };
bool is_available() const {
return status() == Status::available;
}
const Status status() const {
Status status() const {
return array_desc_->status;
}

View File

@@ -1,9 +1,9 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <cassert>
#include <Accelerate/Accelerate.h>
#include <simd/vector.h>
#include <vecLib/vDSP.h>
#include "mlx/backend/common/copy.h"
#include "mlx/primitives.h"

View File

@@ -2,8 +2,7 @@
#include <cassert>
#include <vecLib/BNNS/bnns.h>
#include <vecLib/cblas_new.h>
#include <Accelerate/Accelerate.h>
#include "mlx/backend/accelerate/utils.h"
#include "mlx/backend/common/copy.h"

View File

@@ -3,8 +3,7 @@
#include <cassert>
#include <cmath>
#include <vecLib/vDSP.h>
#include <vecLib/vForce.h>
#include <Accelerate/Accelerate.h>
#include "mlx/allocator.h"
#include "mlx/backend/common/binary.h"
@@ -37,7 +36,7 @@ DEFAULT(Ceil)
DEFAULT(Concatenate)
DEFAULT(Conjugate)
DEFAULT(Copy)
DEFAULT_MULTI(CustomVJP)
DEFAULT_MULTI(CustomTransforms)
DEFAULT_MULTI(Depends)
DEFAULT_MULTI(DivMod)
DEFAULT(NumberOfElements)
@@ -51,6 +50,7 @@ DEFAULT(GatherMM)
DEFAULT(GatherQMM)
DEFAULT(Greater)
DEFAULT(GreaterEqual)
DEFAULT(Hadamard)
DEFAULT(Less)
DEFAULT(LessEqual)
DEFAULT(Load)
@@ -102,7 +102,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
auto& b = inputs[1];
if (a.dtype() == float32) {
binary(
binary_op<float>(
a,
b,
out,
@@ -117,7 +117,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
vDSP_vadd((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n);
});
} else if (a.dtype() == int32) {
binary(
binary_op<int>(
a,
b,
out,
@@ -132,7 +132,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
vDSP_vaddi((const int*)a, 1, (const int*)b, 1, (int*)o, 1, n);
});
} else {
binary(a, b, out, [](auto x, auto y) { return x + y; });
eval(inputs, out);
}
}
@@ -287,7 +287,7 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
auto& b = inputs[1];
if (a.dtype() == int32) {
binary(
binary_op<int>(
a,
b,
out,
@@ -300,7 +300,7 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
vDSP_vdivi((const int*)b, 1, (const int*)a, 1, (int*)o, 1, n);
});
} else if (a.dtype() == float32) {
binary(
binary_op<float>(
a,
b,
out,
@@ -315,7 +315,7 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
vDSP_vdiv((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
});
} else {
binary(a, b, out, [](auto x, auto y) { return x / y; });
eval(inputs, out);
}
}
@@ -326,12 +326,8 @@ void Exp::eval_cpu(const std::vector<array>& inputs, array& out) {
set_unary_output_data(in, out);
auto size = in.data_size();
vvexpf(out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
} else if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, [](auto x) { return std::exp(x); });
} else {
throw std::invalid_argument(
"[exp] Cannot exponentiate elements in array"
" with non floating point type.");
eval(inputs, out);
}
}
@@ -393,12 +389,8 @@ void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
auto size = in.data_size();
vvlog1pf(
out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
} else if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, [](auto x) { return std::log1p(x); });
} else {
throw std::invalid_argument(
"[log1p] Cannot compute log of elements in array with"
" non floating point type.");
eval(inputs, out);
}
}
@@ -408,7 +400,7 @@ void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
auto& b = inputs[1];
if (a.dtype() == float32) {
binary(
binary_op<float>(
a,
b,
out,
@@ -423,7 +415,7 @@ void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
vDSP_vmul((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n);
});
} else {
binary(a, b, out, [](auto x, auto y) { return x * y; });
eval(inputs, out);
}
}
@@ -434,7 +426,7 @@ void Negative::eval_cpu(const std::vector<array>& inputs, array& out) {
set_unary_output_data(in, out);
vDSP_vneg(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
} else {
unary(in, out, [](auto x) { return -x; });
eval(inputs, out);
}
}
@@ -521,7 +513,7 @@ void Square::eval_cpu(const std::vector<array>& inputs, array& out) {
auto size = in.data_size();
vDSP_vsq(in.data<float>(), 1, out.data<float>(), 1, size);
} else {
unary(in, out, [](auto x) { return x * x; });
eval(inputs, out);
}
}
@@ -547,7 +539,7 @@ void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
auto& b = inputs[1];
if (a.dtype() == float32) {
binary(
binary_op<float>(
a,
b,
out,
@@ -565,7 +557,7 @@ void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
vDSP_vsub((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
});
} else if (a.dtype() == int32) {
binary(
binary_op<int>(
a,
b,
out,
@@ -577,7 +569,7 @@ void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
},
UseDefaultBinaryOp());
} else {
binary(a, b, out, [](auto x, auto y) { return x - y; });
eval(inputs, out);
}
}

View File

@@ -2,8 +2,8 @@
#include <cassert>
#include <Accelerate/Accelerate.h>
#include <simd/vector.h>
#include <vecLib/vDSP.h>
#include "mlx/backend/common/reduce.h"
#include "mlx/primitives.h"

View File

@@ -3,7 +3,10 @@
#include <cassert>
#include <limits>
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_neon.h>
#endif
#include <simd/math.h>
#include <simd/vector.h>
@@ -53,25 +56,26 @@ inline simd_float16 simd_fast_exp(simd_float16 x) {
return (*(simd_float16*)&epart) * x;
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/**
* The ARM neon equivalent of the fast exp above.
*/
inline float16x8_t neon_fast_exp(float16x8_t x) {
x = vmulq_f16(x, vdupq_n_f16(1.442695)); // multiply with log_2(e)
x = vmaxq_f16(x, vdupq_n_f16(-14)); // clamp under with -14
x = vminq_f16(x, vdupq_n_f16(14)); // clamp over with 14
x = vmulq_f16(x, vdupq_n_f16(float16_t(1.442695f))); // multiply with log_2(e)
x = vmaxq_f16(x, vdupq_n_f16(float16_t(-14.f))); // clamp under with -14
x = vminq_f16(x, vdupq_n_f16(float16_t(14.f))); // clamp over with 14
float16x8_t ipart = vrndmq_f16(vaddq_f16(x, vdupq_n_f16(0.5)));
float16x8_t ipart = vrndmq_f16(vaddq_f16(x, vdupq_n_f16(float16_t(0.5f))));
float16x8_t fpart = vsubq_f16(x, ipart);
x = vdupq_n_f16(1.535336188319500e-4f);
x = vfmaq_f16(vdupq_n_f16(1.339887440266574e-3f), x, fpart);
x = vfmaq_f16(vdupq_n_f16(1.339887440266574e-3f), x, fpart);
x = vfmaq_f16(vdupq_n_f16(9.618437357674640e-3f), x, fpart);
x = vfmaq_f16(vdupq_n_f16(5.550332471162809e-2f), x, fpart);
x = vfmaq_f16(vdupq_n_f16(2.402264791363012e-1f), x, fpart);
x = vfmaq_f16(vdupq_n_f16(6.931472028550421e-1f), x, fpart);
x = vfmaq_f16(vdupq_n_f16(1.000000000000000f), x, fpart);
x = vdupq_n_f16(float16_t(1.535336188319500e-4f));
x = vfmaq_f16(vdupq_n_f16(float16_t(1.339887440266574e-3f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(1.339887440266574e-3f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(9.618437357674640e-3f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(5.550332471162809e-2f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(2.402264791363012e-1f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(6.931472028550421e-1f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(1.000000000000000f)), x, fpart);
// generate 2**ipart in the floating point representation using integer
// bitshifting
@@ -107,53 +111,6 @@ inline float16_t neon_reduce_add(float16x8_t x) {
return vget_lane_f16(y, 0);
}
template <typename T, typename VT>
struct AccelerateSimdOps {
VT init(T a) {
return a;
}
VT load(const T* a) {
return *(VT*)a;
}
void store(T* dst, VT x) {
*(VT*)dst = x;
}
VT max(VT a, VT b) {
return simd_max(a, b);
};
VT exp(VT x) {
return simd_fast_exp(x);
}
VT add(VT a, VT b) {
return a + b;
}
VT sub(VT a, T b) {
return a - b;
}
VT mul(VT a, VT b) {
return a * b;
}
VT mul(VT a, T b) {
return a * b;
}
T reduce_max(VT x) {
return simd_reduce_max(x);
}
T reduce_add(VT x) {
return simd_reduce_add(x);
}
};
template <typename T, typename VT>
struct NeonFp16SimdOps {
VT init(T a) {
@@ -170,7 +127,7 @@ struct NeonFp16SimdOps {
VT max(VT a, VT b) {
return vmaxq_f16(a, b);
};
}
VT exp(VT x) {
return neon_fast_exp(x);
@@ -201,6 +158,55 @@ struct NeonFp16SimdOps {
}
};
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <typename T, typename VT>
struct AccelerateSimdOps {
VT init(T a) {
return a;
}
VT load(const T* a) {
return *(VT*)a;
}
void store(T* dst, VT x) {
*(VT*)dst = x;
}
VT max(VT a, VT b) {
return simd_max(a, b);
}
VT exp(VT x) {
return simd_fast_exp(x);
}
VT add(VT a, VT b) {
return a + b;
}
VT sub(VT a, T b) {
return a - b;
}
VT mul(VT a, VT b) {
return a * b;
}
VT mul(VT a, T b) {
return a * b;
}
T reduce_max(VT x) {
return simd_reduce_max(x);
}
T reduce_add(VT x) {
return simd_reduce_add(x);
}
};
template <typename T, typename AccT, typename VT, typename Ops, int N>
void softmax(const array& in, array& out) {
Ops ops;
@@ -362,12 +368,16 @@ void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
AccelerateSimdOps<float, simd_float16>,
16>(in, out);
} else {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
softmax<
float16_t,
float16_t,
float16x8_t,
NeonFp16SimdOps<float16_t, float16x8_t>,
8>(in, out);
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
eval(inputs, out); // Redirect to common backend for consistency
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
}
break;
case bfloat16:
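For reference, a plain-Python sketch of the range reduction these SIMD kernels
implement (an illustration reusing the same constants, not the vector code
itself): exp(x) is rewritten as 2**(x * log2(e)), the exponent is split into
integer and fractional parts, and the fractional part is approximated by a
short polynomial.
import math

def fast_exp_sketch(x):
    # Scalar illustration of the SIMD fast exp above; the kernels clamp,
    # round, and evaluate the same polynomial with fused multiply-adds.
    t = min(14.0, max(-14.0, x * 1.442695))  # x * log2(e), clamped
    ipart = math.floor(t + 0.5)
    fpart = t - ipart
    # Horner evaluation of a polynomial approximating 2**fpart
    # (the coefficients are approximately (ln 2)**k / k!).
    acc = 1.535336188319500e-4
    for c in (
        1.339887440266574e-3,
        9.618437357674640e-3,
        5.550332471162809e-2,
        2.402264791363012e-1,
        6.931472028550421e-1,
        1.0,
    ):
        acc = acc * fpart + c
    # The kernels build 2**ipart with integer bit shifts; plain
    # exponentiation is enough for the sketch.
    return acc * 2.0**ipart

print(fast_exp_sketch(1.0), math.exp(1.0))  # both close to e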

View File

@@ -1,8 +1,8 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <vecLib/BNNS/bnns.h>
#include <Accelerate/Accelerate.h>
#include "mlx/dtype.h"
namespace mlx::core {

View File

@@ -42,12 +42,15 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
${CMAKE_CURRENT_SOURCE_DIR}/erf.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduce_utils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp

View File

@@ -196,6 +196,20 @@ void LogAddExp::eval(const std::vector<array>& inputs, array& out) {
}
}
void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalAnd requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalAnd());
}
void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalOr requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalOr());
}
void Maximum::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];

View File

@@ -66,7 +66,7 @@ void Copy::eval(const std::vector<array>& inputs, array& out) {
out.copy_shared_buffer(inputs[0]);
}
void CustomVJP::eval(
void CustomTransforms::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
@@ -250,49 +250,6 @@ void Split::eval(
}
}
std::tuple<bool, int64_t, std::vector<int64_t>> Slice::prepare_slice(
const array& in) {
int64_t data_offset = 0;
bool copy_needed = false;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices_[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides_[i];
copy_needed |= strides_[i] < 0;
}
return std::make_tuple(copy_needed, data_offset, inp_strides);
}
void Slice::shared_buffer_slice(
const array& in,
const std::vector<size_t>& out_strides,
size_t data_offset,
array& out) {
// Compute row/col contiguity
auto [data_size, is_row_contiguous, is_col_contiguous] =
check_contiguity(out.shape(), out_strides);
auto flags = in.flags();
flags.row_contiguous = is_row_contiguous;
flags.col_contiguous = is_col_contiguous;
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in.data_size()) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
out.copy_shared_buffer(in, out_strides, flags, data_size, data_offset);
}
std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
const array& in) {
int64_t data_offset = 0;

View File

@@ -205,8 +205,8 @@ void compiled_allocate_outputs(
// - Donatable
// - Correct size
// - Not a constant
if (in.flags().row_contiguous && in.nbytes() == outputs[o].nbytes() &&
in.is_donatable() &&
if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
if (move_buffers) {
outputs[o].move_shared_buffer(
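Why the stricter donation condition matters: equal byte counts do not imply
equal element counts and itemsizes, so the check now compares both. An
illustrative example (mine, not taken from the change):
import mlx.core as mx

a = mx.zeros((8,), dtype=mx.float16)  # 16 bytes: 8 elements of itemsize 2
b = mx.zeros((4,), dtype=mx.float32)  # 16 bytes: 4 elements of itemsize 4
print(a.nbytes == b.nbytes)                        # True
print(a.size == b.size, a.itemsize == b.itemsize)  # False False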

View File

@@ -4,6 +4,7 @@
#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
namespace mlx::core {
@@ -142,29 +143,31 @@ void copy_general(
const std::vector<int>& data_shape,
const std::vector<stride_t>& i_strides,
int64_t i_offset) {
switch (src.ndim()) {
auto [new_shape, new_strides] = collapse_contiguous_dims(
data_shape, std::vector<std::vector<stride_t>>{i_strides});
switch (new_shape.size()) {
case 1:
copy_general_dim1<SrcT, DstT, stride_t>(
src, dst, data_shape, i_strides, i_offset);
src, dst, new_shape, new_strides[0], i_offset);
return;
case 2:
copy_general_dim2<SrcT, DstT, stride_t>(
src, dst, data_shape, i_strides, i_offset);
src, dst, new_shape, new_strides[0], i_offset);
return;
case 3:
copy_general_dim3<SrcT, DstT, stride_t>(
src, dst, data_shape, i_strides, i_offset);
src, dst, new_shape, new_strides[0], i_offset);
return;
case 4:
copy_general_dim4<SrcT, DstT, stride_t>(
src, dst, data_shape, i_strides, i_offset);
src, dst, new_shape, new_strides[0], i_offset);
return;
}
auto src_ptr = src.data<SrcT>() + i_offset;
auto dst_ptr = dst.data<DstT>();
for (size_t i = 0; i < dst.size(); ++i) {
stride_t src_elem = elem_to_loc(i, data_shape, i_strides);
stride_t src_elem = elem_to_loc(i, new_shape, new_strides[0]);
dst_ptr[i] = static_cast<DstT>(src_ptr[src_elem]);
}
}
@@ -195,10 +198,10 @@ inline void copy_general_general_dims(
const std::vector<int>& data_shape,
const std::vector<stride_t>& i_strides,
const std::vector<stride_t>& o_strides,
stride_t i_offset,
stride_t o_offset) {
int64_t i_offset,
int64_t o_offset) {
if constexpr (D > 1) {
int axis = src.ndim() - D;
int axis = data_shape.size() - D;
auto stride_src = i_strides[axis];
auto stride_dst = o_strides[axis];
auto N = data_shape[axis];
@@ -209,7 +212,7 @@ inline void copy_general_general_dims(
o_offset += stride_dst;
}
} else {
int axis = src.ndim() - 1;
int axis = data_shape.size() - 1;
auto stride_src = i_strides[axis];
auto stride_dst = o_strides[axis];
auto N = data_shape[axis];
@@ -230,38 +233,76 @@ void copy_general_general(
const std::vector<int>& data_shape,
const std::vector<stride_t>& i_strides,
const std::vector<stride_t>& o_strides,
stride_t i_offset,
stride_t o_offset) {
switch (src.ndim()) {
int64_t i_offset,
int64_t o_offset) {
auto [new_shape, new_strides] = collapse_contiguous_dims(
data_shape, std::vector<std::vector<stride_t>>{i_strides, o_strides});
switch (new_shape.size()) {
case 1:
copy_general_general_dims<SrcT, DstT, stride_t, 1>(
src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
src,
dst,
new_shape,
new_strides[0],
new_strides[1],
i_offset,
o_offset);
return;
case 2:
copy_general_general_dims<SrcT, DstT, stride_t, 2>(
src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
src,
dst,
new_shape,
new_strides[0],
new_strides[1],
i_offset,
o_offset);
return;
case 3:
copy_general_general_dims<SrcT, DstT, stride_t, 3>(
src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
src,
dst,
new_shape,
new_strides[0],
new_strides[1],
i_offset,
o_offset);
return;
case 4:
copy_general_general_dims<SrcT, DstT, stride_t, 4>(
src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
src,
dst,
new_shape,
new_strides[0],
new_strides[1],
i_offset,
o_offset);
return;
case 5:
copy_general_general_dims<SrcT, DstT, stride_t, 5>(
src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
src,
dst,
new_shape,
new_strides[0],
new_strides[1],
i_offset,
o_offset);
return;
}
int size = std::accumulate(
data_shape.end() - 5, data_shape.end(), 1, std::multiplies<int>());
new_shape.end() - 5, new_shape.end(), 1, std::multiplies<int>());
for (int i = 0; i < src.size(); i += size) {
stride_t src_offset = i_offset + elem_to_loc(i, data_shape, i_strides);
stride_t dst_offset = o_offset + elem_to_loc(i, dst.shape(), o_strides);
stride_t src_offset = i_offset + elem_to_loc(i, new_shape, new_strides[0]);
stride_t dst_offset = o_offset + elem_to_loc(i, new_shape, new_strides[1]);
copy_general_general_dims<SrcT, DstT, stride_t, 5>(
src, dst, data_shape, i_strides, o_strides, src_offset, dst_offset);
src,
dst,
new_shape,
new_strides[0],
new_strides[1],
src_offset,
dst_offset);
}
}
@@ -444,8 +485,17 @@ void copy_inplace(
}
}
template <>
void copy_inplace<int64_t>(
template void copy_inplace<size_t>(
const array& src,
array& dst,
const std::vector<int>& data_shape,
const std::vector<size_t>& i_strides,
const std::vector<size_t>& o_strides,
int64_t i_offset,
int64_t o_offset,
CopyType ctype);
template void copy_inplace<int64_t>(
const array& src,
array& dst,
const std::vector<int>& data_shape,
@@ -453,24 +503,6 @@ void copy_inplace<int64_t>(
const std::vector<int64_t>& o_strides,
int64_t i_offset,
int64_t o_offset,
CopyType ctype) {
switch (ctype) {
case CopyType::General:
case CopyType::GeneralGeneral:
return copy_inplace_dispatch(
src,
dst,
ctype,
data_shape,
i_strides,
o_strides,
i_offset,
o_offset);
case CopyType::Scalar:
case CopyType::Vector:
return copy_inplace_dispatch(src, dst, ctype);
}
}
CopyType ctype);
} // namespace mlx::core
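A rough single-tensor sketch of what the dimension collapsing buys here (the
real ``collapse_contiguous_dims`` handles several stride sets at once):
adjacent dimensions are fused whenever the outer stride equals the inner shape
times the inner stride, so many more layouts reach the specialized 1-4
dimensional copy loops.
def collapse_contiguous_dims(shape, strides):
    # Fuse adjacent dims, working from the innermost axis outward.
    if not shape:
        return [], []
    new_shape = [shape[-1]]
    new_strides = [strides[-1]]
    for dim, stride in zip(reversed(shape[:-1]), reversed(strides[:-1])):
        if stride == new_shape[-1] * new_strides[-1]:
            new_shape[-1] *= dim          # contiguous with the inner block
        else:
            new_shape.append(dim)
            new_strides.append(stride)
    return new_shape[::-1], new_strides[::-1]

print(collapse_contiguous_dims([2, 3, 4], [12, 4, 1]))  # ([24], [1])
print(collapse_contiguous_dims([2, 3, 4], [24, 4, 1]))  # ([2, 12], [24, 1])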

View File

@@ -5,7 +5,6 @@
#else
#include <cblas.h>
#endif
#include <cstring>
#include "mlx/array.h"
@@ -53,7 +52,7 @@ DEFAULT(Convolution)
DEFAULT(Copy)
DEFAULT(Cos)
DEFAULT(Cosh)
DEFAULT_MULTI(CustomVJP)
DEFAULT_MULTI(CustomTransforms)
DEFAULT_MULTI(Depends)
DEFAULT(Divide)
DEFAULT(NumberOfElements)
@@ -69,6 +68,7 @@ DEFAULT(Full)
DEFAULT(Gather)
DEFAULT(Greater)
DEFAULT(GreaterEqual)
DEFAULT(Hadamard)
DEFAULT(Less)
DEFAULT(LessEqual)
DEFAULT(Load)

View File

@@ -0,0 +1,107 @@
// Copyright © 2024 Apple Inc.
#include <cassert>
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/hadamard.h"
#include "mlx/primitives.h"
namespace mlx::core {
// n = 2^k component
template <typename T>
void hadamard_n(array& out, int n, int m, float scale) {
for (int b = 0; b < out.size() / n; b++) {
size_t loc = b * n;
T* data_ptr = out.data<T>() + loc;
int h = 1;
int n_over_2 = n / 2;
while (h < n) {
for (int i = 0; i < n / 2; i++) {
int k = i & (h - 1);
int j = ((i - k) << 1) + k;
float x = *(data_ptr + j);
float y = *(data_ptr + j + h);
*(data_ptr + j) = x + y;
*(data_ptr + j + h) = x - y;
if (h == n_over_2) {
*(data_ptr + j) *= scale;
*(data_ptr + j + h) *= scale;
}
}
h <<= 1;
}
}
}
// m component
template <typename T>
void hadamard_m(array& out, int n, int m, float scale) {
auto h_matrices = hadamard_matrices();
auto& matrix = h_matrices[m];
auto start = 1;
auto end = matrix.find('\n', start);
std::vector<bool> hmat_vec;
while (end != std::string_view::npos) {
auto row = matrix.substr(start, end - start);
for (int i = 0; i < row.length(); i++) {
hmat_vec.push_back(row[i] == '+');
}
start = end + 1;
end = matrix.find('\n', start);
}
for (int b = 0; b < out.size() / m / n; b++) {
size_t loc = b * n * m;
T* data_ptr = out.data<T>() + loc;
for (int i = 0; i < n; i++) {
std::vector<float> out(m);
for (int j = 0; j < m; j++) {
for (int k = 0; k < m; k++) {
float x = *(data_ptr + i + k * n);
if (hmat_vec[k + j * m]) {
out[j] += x;
} else {
out[j] -= x;
}
}
}
for (int j = 0; j < m; j++) {
*(data_ptr + i + j * n) = out[j] * scale;
}
}
}
}
template <typename T>
void hadamard(array& out, int n, int m, float scale) {
float n_scale = m > 1 ? 1.0 : scale;
hadamard_n<T>(out, n, m, n_scale);
if (m > 1) {
hadamard_m<T>(out, n, m, scale);
}
}
void Hadamard::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
// Copy input to output
copy(in, out, CopyType::General);
int axis = out.ndim() - 1;
auto [n, m] = decompose_hadamard(out.shape(axis));
switch (in.dtype()) {
case float32:
return hadamard<float>(out, n, m, scale_);
case float16:
return hadamard<float16_t>(out, n, m, scale_);
case bfloat16:
return hadamard<bfloat16_t>(out, n, m, scale_);
default:
throw std::invalid_argument("[hadamard] Unsupported type.");
}
}
} // namespace mlx::core
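A NumPy reference for the ``hadamard_n`` butterfly above; an independent
sketch to make the index arithmetic concrete, not code from the change:
import numpy as np

def fwht(x, scale=1.0):
    # Fast Walsh-Hadamard transform along the last axis for n = 2**k:
    # log2(n) butterfly passes, with the scale folded into the final pass.
    x = np.array(x, dtype=np.float32, copy=True)
    n = x.shape[-1]
    assert n & (n - 1) == 0, "n must be a power of two"
    h = 1
    while h < n:
        for i in range(n // 2):
            k = i & (h - 1)            # position inside the current block
            j = ((i - k) << 1) + k     # index of the "top" element
            a, b = x[..., j].copy(), x[..., j + h].copy()
            x[..., j] = a + b
            x[..., j + h] = a - b
            if h == n // 2:            # last pass: apply the scale
                x[..., j] *= scale
                x[..., j + h] *= scale
        h <<= 1
    return x

# Matches the dense Sylvester Hadamard matrix for a small case.
n = 8
H = np.array([[(-1) ** bin(i & j).count("1") for j in range(n)] for i in range(n)])
assert np.allclose(fwht(np.eye(n)), H)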

View File

@@ -0,0 +1,105 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include <map>
#include "mlx/utils.h"
namespace mlx::core {
// From http://neilsloane.com/hadamard/
constexpr std::string_view h12 = R"(
+-++++++++++
--+-+-+-+-+-
+++-++----++
+---+--+-++-
+++++-++----
+-+---+--+-+
++--+++-++--
+--++---+--+
++----+++-++
+--+-++---+-
++++----+++-
+-+--+-++---
)";
constexpr std::string_view h20 = R"(
+----+----++--++-++-
-+----+---+++---+-++
--+----+---+++-+-+-+
---+----+---+++++-+-
----+----++--++-++-+
-+++++-----+--+++--+
+-+++-+---+-+--+++--
++-++--+---+-+--+++-
+++-+---+---+-+--+++
++++-----++--+-+--++
--++-+-++-+-----++++
---++-+-++-+---+-+++
+---++-+-+--+--++-++
++---++-+----+-+++-+
-++---++-+----+++++-
-+--+--++-+----+----
+-+-----++-+----+---
-+-+-+---+--+----+--
--+-+++------+----+-
+--+--++------+----+
)";
constexpr std::string_view h28 = R"(
+------++----++-+--+-+--++--
-+-----+++-----+-+--+-+--++-
--+-----+++---+-+-+----+--++
---+-----+++---+-+-+-+--+--+
----+-----+++---+-+-+++--+--
-----+-----++++--+-+--++--+-
------++----++-+--+-+--++--+
--++++-+-------++--+++-+--+-
---++++-+-----+-++--+-+-+--+
+---+++--+----++-++--+-+-+--
++---++---+----++-++--+-+-+-
+++---+----+----++-++--+-+-+
++++--------+-+--++-++--+-+-
-++++--------+++--++--+--+-+
-+-++-++--++--+--------++++-
+-+-++--+--++--+--------++++
-+-+-++--+--++--+----+---+++
+-+-+-++--+--+---+---++---++
++-+-+-++--+------+--+++---+
-++-+-+-++--+------+-++++---
+-++-+---++--+------+-++++--
-++--++-+-++-+++----++------
+-++--++-+-++-+++-----+-----
++-++---+-+-++-+++-----+----
-++-++-+-+-+-+--+++-----+---
--++-++++-+-+----+++-----+--
+--++-+-++-+-+----+++-----+-
++--++-+-++-+-+----++------+
)";
inline const std::map<int, std::string_view> hadamard_matrices() {
return {{12, h12}, {20, h20}, {28, h28}};
}
inline std::pair<int, int> decompose_hadamard(int n) {
// n = m*2^k
int m = 1;
if (!is_power_of_2(n)) {
auto h_matrices = hadamard_matrices();
for (auto [factor, _] : h_matrices) {
if (n % factor == 0) {
m = factor;
n /= factor;
break;
}
}
if (m == 1) {
throw std::invalid_argument(
"[hadamard] Only supports n = m*2^k where m in (1, 12, 20, 28).");
}
}
return {n, m};
}
} // namespace mlx::core
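The same decomposition in Python, mirroring ``decompose_hadamard`` above; like
the C++ shown, it does not re-check that the remaining factor is a power of two:
def decompose_hadamard(n):
    # Split n as m * 2**k with m in {1, 12, 20, 28}.
    m = 1
    if n & (n - 1) != 0:  # not a power of two
        for factor in (12, 20, 28):
            if n % factor == 0:
                m, n = factor, n // factor
                break
        else:
            raise ValueError(
                "[hadamard] Only supports n = m*2^k where m in (1, 12, 20, 28)."
            )
    return n, m

print(decompose_hadamard(1024))     # (1024, 1)
print(decompose_hadamard(20 * 64))  # (64, 20)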

View File

@@ -108,105 +108,105 @@ struct Abs {
template <typename T>
T operator()(T x) {
return std::abs(x);
};
}
uint8_t operator()(uint8_t x) {
return x;
};
}
uint16_t operator()(uint16_t x) {
return x;
};
}
uint32_t operator()(uint32_t x) {
return x;
};
}
uint64_t operator()(uint64_t x) {
return x;
};
}
bool operator()(bool x) {
return x;
};
}
};
struct ArcCos {
template <typename T>
T operator()(T x) {
return std::acos(x);
};
}
};
struct ArcCosh {
template <typename T>
T operator()(T x) {
return std::acosh(x);
};
}
};
struct ArcSin {
template <typename T>
T operator()(T x) {
return std::asin(x);
};
}
};
struct ArcSinh {
template <typename T>
T operator()(T x) {
return std::asinh(x);
};
}
};
struct ArcTan {
template <typename T>
T operator()(T x) {
return std::atan(x);
};
}
};
struct ArcTan2 {
template <typename T>
T operator()(T y, T x) {
return std::atan2(y, x);
};
}
};
struct ArcTanh {
template <typename T>
T operator()(T x) {
return std::atanh(x);
};
}
};
struct Ceil {
template <typename T>
T operator()(T x) {
return std::ceil(x);
};
}
int8_t operator()(int8_t x) {
return x;
};
}
int16_t operator()(int16_t x) {
return x;
};
}
int32_t operator()(int32_t x) {
return x;
};
}
int64_t operator()(int64_t x) {
return x;
};
}
uint8_t operator()(uint8_t x) {
return x;
};
}
uint16_t operator()(uint16_t x) {
return x;
};
}
uint32_t operator()(uint32_t x) {
return x;
};
}
uint64_t operator()(uint64_t x) {
return x;
};
}
bool operator()(bool x) {
return x;
};
}
};
struct Conjugate {
@@ -219,35 +219,35 @@ struct Cos {
template <typename T>
T operator()(T x) {
return std::cos(x);
};
}
};
struct Cosh {
template <typename T>
T operator()(T x) {
return std::cosh(x);
};
}
};
struct Erf {
template <typename T>
T operator()(T x) {
return static_cast<T>(fast_erf(static_cast<float>(x)));
};
}
};
struct ErfInv {
template <typename T>
T operator()(T x) {
return static_cast<T>(fast_erfinv(static_cast<float>(x)));
};
}
};
struct Exp {
template <typename T>
T operator()(T x) {
return fast_exp(x);
};
}
complex64_t operator()(complex64_t x) {
return std::exp(x);
@@ -258,83 +258,83 @@ struct Expm1 {
template <typename T>
T operator()(T x) {
return expm1(x);
};
}
};
struct Floor {
template <typename T>
T operator()(T x) {
return std::floor(x);
};
}
int8_t operator()(int8_t x) {
return x;
};
}
int16_t operator()(int16_t x) {
return x;
};
}
int32_t operator()(int32_t x) {
return x;
};
}
int64_t operator()(int64_t x) {
return x;
};
}
uint8_t operator()(uint8_t x) {
return x;
};
}
uint16_t operator()(uint16_t x) {
return x;
};
}
uint32_t operator()(uint32_t x) {
return x;
};
}
uint64_t operator()(uint64_t x) {
return x;
};
}
bool operator()(bool x) {
return x;
};
}
};
struct Log {
template <typename T>
T operator()(T x) {
return std::log(x);
};
}
};
struct Log2 {
template <typename T>
T operator()(T x) {
return std::log2(x);
};
}
};
struct Log10 {
template <typename T>
T operator()(T x) {
return std::log10(x);
};
}
};
struct Log1p {
template <typename T>
T operator()(T x) {
return log1p(x);
};
}
};
struct LogicalNot {
template <typename T>
T operator()(T x) {
return !x;
};
}
};
struct Negative {
template <typename T>
T operator()(T x) {
return -x;
};
}
};
struct Round {
@@ -379,49 +379,49 @@ struct Sin {
template <typename T>
T operator()(T x) {
return std::sin(x);
};
}
};
struct Sinh {
template <typename T>
T operator()(T x) {
return std::sinh(x);
};
}
};
struct Square {
template <typename T>
T operator()(T x) {
return x * x;
};
}
};
struct Sqrt {
template <typename T>
T operator()(T x) {
return std::sqrt(x);
};
}
};
struct Rsqrt {
template <typename T>
T operator()(T x) {
return static_cast<decltype(x)>(1.0) / std::sqrt(x);
};
}
};
struct Tan {
template <typename T>
T operator()(T x) {
return std::tan(x);
};
}
};
struct Tanh {
template <typename T>
T operator()(T x) {
return std::tanh(x);
};
}
};
struct Add {
@@ -554,7 +554,7 @@ struct LogAddExp {
? maxval
: static_cast<decltype(x)>(
maxval + std::log1p(fast_exp(minval - maxval)));
};
}
};
struct Multiply {
@@ -602,14 +602,14 @@ struct LogicalAnd {
template <typename T>
T operator()(T x, T y) {
return x && y;
};
}
};
struct LogicalOr {
template <typename T>
T operator()(T x, T y) {
return x || y;
};
}
};
struct Select {
@@ -623,35 +623,35 @@ struct BitwiseAnd {
template <typename T>
T operator()(T x, T y) {
return x & y;
};
}
};
struct BitwiseOr {
template <typename T>
T operator()(T x, T y) {
return x | y;
};
}
};
struct BitwiseXor {
template <typename T>
T operator()(T x, T y) {
return x ^ y;
};
}
};
struct LeftShift {
template <typename T>
T operator()(T x, T y) {
return x << y;
};
}
};
struct RightShift {
template <typename T>
T operator()(T x, T y) {
return x >> y;
};
}
};
} // namespace mlx::core::detail
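A minimal sketch of how these functor structs are consumed; the include path matches the one used further down in this diff, and the example itself is illustrative only:
#include <algorithm>
#include <vector>
#include "mlx/backend/common/ops.h"
// Sketch: the structs above are plain callables, so they compose with the
// standard algorithms as well as the backend's unary/binary dispatch.
inline void ops_functor_example() {
  std::vector<float> x{-1.5f, 2.0f, -3.0f};
  std::transform(x.begin(), x.end(), x.begin(), mlx::core::detail::Abs{});
  // x == {1.5f, 2.0f, 3.0f}
  int masked = mlx::core::detail::BitwiseAnd{}(0b1101, 0b1011); // 0b1001 == 9
  (void)masked;
}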

View File

@@ -8,9 +8,9 @@
#include "mlx/allocator.h"
#include "mlx/backend/common/arange.h"
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/ops.h"
#include "mlx/backend/common/slicing.h"
#include "mlx/backend/common/threefry.h"
#include "mlx/backend/common/unary.h"
#include "mlx/backend/common/utils.h"
@@ -313,20 +313,6 @@ void LogicalNot::eval(const std::vector<array>& inputs, array& out) {
unary(in, out, detail::LogicalNot());
}
void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalAnd requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalAnd());
}
void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalOr requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalOr());
}
void Negative::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
@@ -419,7 +405,17 @@ void Reshape::eval(const std::vector<array>& inputs, array& out) {
auto [copy_necessary, out_strides] = prepare_reshape(in, out);
if (copy_necessary) {
copy(in, out, in.data_size() == 1 ? CopyType::Scalar : CopyType::General);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto out_strides = make_contiguous_strides<size_t>(in.shape());
copy_inplace<size_t>(
in,
out,
in.shape(),
in.strides(),
out_strides,
0,
0,
CopyType::General);
} else {
shared_buffer_reshape(in, out_strides, out);
}
@@ -492,7 +488,8 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
auto& in = inputs[0];
// Calculate out strides, initial offset and if copy needs to be made
auto [copy_needed, data_offset, inp_strides] = prepare_slice(in);
auto [copy_needed, data_offset, inp_strides] =
prepare_slice(in, start_indices_, strides_);
// Do copy if needed
if (copy_needed) {
@@ -590,4 +587,36 @@ void Tanh::eval(const std::vector<array>& inputs, array& out) {
}
}
void View::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
auto ibytes = size_of(in.dtype());
auto obytes = size_of(out.dtype());
// Conditions for buffer copying (disjunction):
// - type size is the same
// - type size is smaller and the last axis is contiguous
// - the entire array is row contiguous
if (ibytes == obytes || obytes < ibytes && in.strides().back() == 1 ||
in.flags().row_contiguous) {
auto strides = in.strides();
for (int i = 0; i < strides.size() - 1; ++i) {
strides[i] *= ibytes;
strides[i] /= obytes;
}
out.copy_shared_buffer(
in, strides, in.flags(), in.data_size() * obytes / ibytes);
} else {
auto tmp = array(in.shape(), in.dtype(), nullptr, {});
tmp.set_data(allocator::malloc_or_wait(tmp.nbytes()));
copy_inplace(in, tmp, CopyType::General);
auto flags = out.flags();
flags.contiguous = true;
flags.row_contiguous = true;
auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
out.move_shared_buffer(tmp, out.strides(), flags, out.size());
}
}
} // namespace mlx::core
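A worked example of the shared-buffer branch in View::eval_cpu above, as a sketch: viewing a row-contiguous float32 array of shape {2, 3} (strides {3, 1}) as int8 gives ibytes = 4 and obytes = 1, so every stride except the last is rescaled to 3 * 4 / 1 = 12 and the buffer is shared without a copy; only when none of the three listed conditions hold does the else branch materialize a contiguous temporary and rebuild the contiguity flags.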

View File

@@ -104,48 +104,14 @@ void reduce_dispatch_out(
}
case Reduce::Sum: {
auto op = [](auto y, auto x) { (*y) = (*y) + x; };
switch (out.dtype()) {
case bool_:
reduction_op<InT, bool>(in, out, axes, false, op);
break;
case uint8:
reduction_op<InT, uint8_t>(in, out, axes, 0, op);
break;
case uint16:
reduction_op<InT, uint16_t>(in, out, axes, 0, op);
break;
case uint32:
reduction_op<InT, uint32_t>(in, out, axes, 0, op);
break;
case uint64:
reduction_op<InT, uint64_t>(in, out, axes, 0, op);
break;
case int8:
reduction_op<InT, int8_t>(in, out, axes, 0, op);
break;
case int16:
reduction_op<InT, int16_t>(in, out, axes, 0, op);
break;
case int32:
reduction_op<InT, int32_t>(in, out, axes, 0, op);
break;
case int64:
reduction_op<InT, int64_t>(in, out, axes, 0, op);
break;
case float16:
reduction_op<InT, float16_t>(in, out, axes, 0.0f, op);
break;
case float32:
reduction_op<InT, float>(in, out, axes, 0.0f, op);
break;
case bfloat16:
reduction_op<InT, bfloat16_t>(in, out, axes, 0.0f, op);
break;
case complex64:
reduction_op<InT, complex64_t>(in, out, axes, complex64_t{0.0f}, op);
break;
if (out.dtype() == int32) {
// special case since the input type can be bool
reduction_op<InT, int32_t>(in, out, axes, 0, op);
} else {
reduction_op<InT, InT>(in, out, axes, 0, op);
}
} break;
break;
}
case Reduce::Prod: {
auto op = [](auto y, auto x) { (*y) *= x; };
reduction_op<InT, InT>(in, out, axes, 1, op);
@@ -168,6 +134,29 @@ void reduce_dispatch_out(
} // namespace
void nd_loop(
std::function<void(int)> callback,
const std::vector<int>& shape,
const std::vector<size_t>& strides) {
std::function<void(int, int)> loop_inner;
loop_inner = [&](int dim, int offset) {
if (dim < shape.size() - 1) {
int size = shape[dim];
size_t stride = strides[dim];
for (int i = 0; i < size; i++) {
loop_inner(dim + 1, offset + i * stride);
}
} else {
int size = shape[dim];
size_t stride = strides[dim];
for (int i = 0; i < size; i++) {
callback(offset + i * stride);
}
}
};
loop_inner(0, 0);
}
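A small sketch of what the nd_loop added above produces, assuming <vector> and this file's headers are in scope:
// Sketch: collect the offsets visited for a 2 x 3 view with strides {1, 2}
// (a transposed layout); the callback sees 0, 2, 4, 1, 3, 5 in that order.
inline std::vector<int> nd_loop_example() {
  std::vector<int> visited;
  nd_loop(
      [&](int offset) { visited.push_back(offset); },
      /* shape   */ {2, 3},
      /* strides */ {1, 2});
  return visited; // {0, 2, 4, 1, 3, 5}
}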
void Reduce::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];

View File

@@ -49,47 +49,18 @@ struct ReductionPlan {
ReductionPlan(ReductionOpType type_) : type(type_) {}
};
namespace {
ReductionPlan get_reduction_plan(const array& x, const std::vector<int> axes);
// Helper for the ndimensional strided loop
// Should this be in utils?
inline void nd_loop(
void nd_loop(
std::function<void(int)> callback,
const std::vector<int>& shape,
const std::vector<size_t>& strides) {
std::function<void(int, int)> loop_inner;
loop_inner = [&](int dim, int offset) {
if (dim < shape.size() - 1) {
int size = shape[dim];
size_t stride = strides[dim];
for (int i = 0; i < size; i++) {
loop_inner(dim + 1, offset + i * stride);
}
} else {
int size = shape[dim];
size_t stride = strides[dim];
for (int i = 0; i < size; i++) {
callback(offset + i * stride);
}
}
};
loop_inner(0, 0);
}
const std::vector<size_t>& strides);
std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
const array& x,
const std::vector<int>& axes) {
std::vector<int> shape = x.shape();
std::vector<size_t> strides = x.strides();
for (int i = axes.size() - 1; i >= 0; i--) {
int a = axes[i];
shape.erase(shape.begin() + a);
strides.erase(strides.begin() + a);
}
return std::make_pair(shape, strides);
}
const std::vector<int>& axes);
template <typename T, typename U, typename Op>
struct DefaultStridedReduce {
@@ -123,102 +94,6 @@ struct DefaultContiguousReduce {
}
};
ReductionPlan get_reduction_plan(const array& x, const std::vector<int> axes) {
// The data is all there and we are reducing over everything
if (x.size() == x.data_size() && axes.size() == x.ndim() &&
x.flags().contiguous) {
return ContiguousAllReduce;
}
// Row contiguous input so the output is row contiguous
if (x.flags().row_contiguous) {
// Merge consecutive axes
std::vector<int> shape = {x.shape(axes[0])};
std::vector<size_t> strides = {x.strides()[axes[0]]};
for (int i = 1; i < axes.size(); i++) {
if (axes[i] - 1 == axes[i - 1]) {
shape.back() *= x.shape(axes[i]);
strides.back() = x.strides()[axes[i]];
} else {
shape.push_back(x.shape(axes[i]));
strides.push_back(x.strides()[axes[i]]);
}
}
if (strides.back() == 1) {
return ReductionPlan(ContiguousReduce, shape, strides);
} else if (strides.back() > 1) {
return ReductionPlan(ContiguousStridedReduce, shape, strides);
}
}
// Let's check if we can optimize our access patterns
//
// 1. We have a reduction axis with stride 1. Simply call
// GeneralContiguousReduce and be done with it.
// 2. We have transpositions and we are not reducing over the axis with
// stride 1. However, we are reducing over an axis where everything is
// contiguous in memory to the right of that axis. We can call strided
// reduce and be done with it.
// 3. We have weird transpositions and expands. Copy the strides to the
// output, then call strided reduce.
// Sort reduction axes by stride in order to merge them and figure out if we
// have a contiguous reduction.
std::vector<std::pair<int, size_t>> reductions;
for (auto a : axes) {
reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
}
std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
return a.second > b.second;
});
// Extract the two smallest and try to merge them in case the contiguous
// reduction can be bigger than just the last axis.
for (int i = reductions.size() - 1; i >= 1; i--) {
auto a = reductions[i];
auto b = reductions[i - 1];
// If b.stride == a.shape * a.stride then a and b are contiguous
if (b.second == a.first * a.second) {
reductions.erase(reductions.begin() + i);
reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
}
}
std::vector<int> shape;
std::vector<size_t> strides;
for (auto r : reductions) {
shape.push_back(r.first);
strides.push_back(r.second);
}
// We can call the contiguous reduction op for every weird way the input is
// structured in the rest of the axes.
if (strides.back() == 1) {
return ReductionPlan(GeneralContiguousReduce, shape, strides);
}
// Delegate to the general strided reduction op if the axes after
// strides.back() are contiguous.
if (strides.back() > 1) {
int size = 1;
for (int i = x.ndim() - 1; i >= 0; i--) {
if (axes.back() == i) {
continue;
}
if (x.strides()[i] != size) {
break;
}
size *= x.shape(i);
}
if (size >= strides.back()) {
return ReductionPlan(GeneralStridedReduce, shape, strides);
}
}
return ReductionPlan(GeneralReduce, shape, strides);
}
template <typename T, typename U, typename OpS, typename OpC, typename Op>
void reduction_op(
const array& x,
@@ -361,6 +236,4 @@ void reduction_op(
reduction_op<T, U>(x, out, axes, init, ops, opc, op);
}
} // namespace
} // namespace mlx::core

View File

@@ -0,0 +1,118 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/common/reduce.h"
namespace mlx::core {
std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
const array& x,
const std::vector<int>& axes) {
std::vector<int> shape = x.shape();
std::vector<size_t> strides = x.strides();
for (int i = axes.size() - 1; i >= 0; i--) {
int a = axes[i];
shape.erase(shape.begin() + a);
strides.erase(strides.begin() + a);
}
return std::make_pair(shape, strides);
}
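As a quick sketch of the helper above: for a row-contiguous x of shape {2, 3, 4} (strides {12, 4, 1}) reduced over axes {1}, it returns shape {2, 4} and strides {12, 1}, i.e. the original strides with the reduced axis erased rather than recomputed.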
ReductionPlan get_reduction_plan(const array& x, const std::vector<int> axes) {
// The data is all there and we are reducing over everything
if (x.size() == x.data_size() && axes.size() == x.ndim() &&
x.flags().contiguous) {
return ContiguousAllReduce;
}
// Row contiguous input so the output is row contiguous
if (x.flags().row_contiguous) {
// Merge consecutive axes
std::vector<int> shape = {x.shape(axes[0])};
std::vector<size_t> strides = {x.strides()[axes[0]]};
for (int i = 1; i < axes.size(); i++) {
if (axes[i] - 1 == axes[i - 1]) {
shape.back() *= x.shape(axes[i]);
strides.back() = x.strides()[axes[i]];
} else {
shape.push_back(x.shape(axes[i]));
strides.push_back(x.strides()[axes[i]]);
}
}
if (strides.back() == 1) {
return ReductionPlan(ContiguousReduce, shape, strides);
} else if (strides.back() > 1) {
return ReductionPlan(ContiguousStridedReduce, shape, strides);
}
}
// Let's check if we can optimize our access patterns
//
// 1. We have a reduction axis with stride 1. Simply call
// GeneralContiguousReduce and be done with it.
// 2. We have transpositions and we are not reducing over the axis with
// stride 1. However, we are reducing over an axis where everything is
// contiguous in memory to the right of that axis. We can call strided
// reduce and be done with it.
// 3. We have weird transpositions and expands. Copy the strides to the
// output, then call strided reduce.
// Sort reduction axes by stride in order to merge them and figure out if we
// have a contiguous reduction.
std::vector<std::pair<int, size_t>> reductions;
for (auto a : axes) {
reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
}
std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
return a.second > b.second;
});
// Extract the two smallest and try to merge them in case the contiguous
// reduction can be bigger than just the last axis.
for (int i = reductions.size() - 1; i >= 1; i--) {
auto a = reductions[i];
auto b = reductions[i - 1];
// If b.stride == a.shape * a.stride then a and b are contiguous
if (b.second == a.first * a.second) {
reductions.erase(reductions.begin() + i);
reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
}
}
std::vector<int> shape;
std::vector<size_t> strides;
for (auto r : reductions) {
shape.push_back(r.first);
strides.push_back(r.second);
}
// We can call the contiguous reduction op for every weird way the input is
// structured in the rest of the axes.
if (strides.back() == 1) {
return ReductionPlan(GeneralContiguousReduce, shape, strides);
}
// Delegate to the general strided reduction op if the axes after
// strides.back() are contiguous.
if (strides.back() > 1) {
int size = 1;
for (int i = x.ndim() - 1; i >= 0; i--) {
if (axes.back() == i) {
continue;
}
if (x.strides()[i] != size) {
break;
}
size *= x.shape(i);
}
if (size >= strides.back()) {
return ReductionPlan(GeneralStridedReduce, shape, strides);
}
}
return ReductionPlan(GeneralReduce, shape, strides);
}
} // namespace mlx::core

View File

@@ -234,7 +234,7 @@ void scan_dispatch(
auto op = [](U* o, const U* y, const T* x) { *o = (*x < *y) ? *y : *x; };
auto init = (issubdtype(input.dtype(), floating))
? static_cast<U>(-std::numeric_limits<float>::infinity())
: std::numeric_limits<U>::max();
: std::numeric_limits<U>::min();
auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);

View File

@@ -0,0 +1,52 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/common/utils.h"
namespace mlx::core {
std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
const array& in,
std::vector<int>& start_indices,
std::vector<int>& strides) {
int64_t data_offset = 0;
bool copy_needed = false;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides[i];
copy_needed |= strides[i] < 0;
}
return std::make_tuple(copy_needed, data_offset, inp_strides);
}
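A worked example for the routine above, as a sketch: slicing a row-contiguous {4, 6} array (strides {6, 1}) with start_indices {1, 2} and step strides {2, 1} gives data_offset = 1 * 6 + 2 * 1 = 8 and inp_strides = {12, 1}; copy_needed stays false because no step is negative.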
void shared_buffer_slice(
const array& in,
const std::vector<size_t>& out_strides,
size_t data_offset,
array& out) {
// Compute row/col contiguity
auto [data_size, is_row_contiguous, is_col_contiguous] =
check_contiguity(out.shape(), out_strides);
auto flags = in.flags();
flags.row_contiguous = is_row_contiguous;
flags.col_contiguous = is_col_contiguous;
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in.data_size()) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
out.copy_shared_buffer(in, out_strides, flags, data_size, data_offset);
}
} // namespace mlx::core

View File

@@ -0,0 +1,20 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/array.h"
namespace mlx::core {
std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
const array& in,
std::vector<int>& start_indices,
std::vector<int>& strides);
void shared_buffer_slice(
const array& in,
const std::vector<size_t>& out_strides,
size_t data_offset,
array& out);
} // namespace mlx::core

View File

@@ -113,14 +113,14 @@ void sort(const array& in, array& out, int axis) {
axis = axis < 0 ? axis + in.ndim() : axis;
size_t n_rows = in.size() / in.shape(axis);
auto remaining_shape = in.shape();
auto remaining_shape = out.shape();
remaining_shape.erase(remaining_shape.begin() + axis);
auto remaining_strides = in.strides();
auto remaining_strides = out.strides();
remaining_strides.erase(remaining_strides.begin() + axis);
size_t axis_stride = in.strides()[axis];
int axis_size = in.shape(axis);
size_t axis_stride = out.strides()[axis];
int axis_size = out.shape(axis);
// Perform sorting in place
for (int i = 0; i < n_rows; i++) {
@@ -143,34 +143,42 @@ void argsort(const array& in, array& out, int axis) {
axis = axis < 0 ? axis + in.ndim() : axis;
size_t n_rows = in.size() / in.shape(axis);
auto remaining_shape = in.shape();
remaining_shape.erase(remaining_shape.begin() + axis);
auto in_remaining_shape = in.shape();
in_remaining_shape.erase(in_remaining_shape.begin() + axis);
auto remaining_strides = in.strides();
remaining_strides.erase(remaining_strides.begin() + axis);
auto in_remaining_strides = in.strides();
in_remaining_strides.erase(in_remaining_strides.begin() + axis);
size_t axis_stride = in.strides()[axis];
auto out_remaining_shape = out.shape();
out_remaining_shape.erase(out_remaining_shape.begin() + axis);
auto out_remaining_strides = out.strides();
out_remaining_strides.erase(out_remaining_strides.begin() + axis);
size_t in_stride = in.strides()[axis];
size_t out_stride = out.strides()[axis];
int axis_size = in.shape(axis);
// Perform sorting
for (int i = 0; i < n_rows; i++) {
size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
const T* data_ptr = in.data<T>() + loc;
IdxT* idx_ptr = out.data<IdxT>() + loc;
size_t in_loc = elem_to_loc(i, in_remaining_shape, in_remaining_strides);
size_t out_loc = elem_to_loc(i, out_remaining_shape, out_remaining_strides);
const T* data_ptr = in.data<T>() + in_loc;
IdxT* idx_ptr = out.data<IdxT>() + out_loc;
StridedIterator st_(idx_ptr, axis_stride, 0);
StridedIterator ed_(idx_ptr, axis_stride, axis_size);
StridedIterator st_(idx_ptr, out_stride, 0);
StridedIterator ed_(idx_ptr, out_stride, axis_size);
// Initialize with iota
std::iota(st_, ed_, IdxT(0));
// Sort according to vals
StridedIterator st(idx_ptr, axis_stride, 0);
StridedIterator ed(idx_ptr, axis_stride, axis_size);
StridedIterator st(idx_ptr, out_stride, 0);
StridedIterator ed(idx_ptr, out_stride, axis_size);
std::stable_sort(st, ed, [data_ptr, axis_stride](IdxT a, IdxT b) {
auto v1 = data_ptr[a * axis_stride];
auto v2 = data_ptr[b * axis_stride];
std::stable_sort(st, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
auto v1 = data_ptr[a * in_stride];
auto v2 = data_ptr[b * in_stride];
return v1 < v2 || (v1 == v2 && a < b);
});
}

View File

@@ -29,6 +29,15 @@ inline size_t elem_to_loc(int elem, const array& a) {
return elem_to_loc(elem, a.shape(), a.strides());
}
template <typename stride_t>
std::vector<stride_t> make_contiguous_strides(const std::vector<int>& shape) {
std::vector<stride_t> strides(shape.size(), 1);
for (int i = shape.size() - 1; i > 0; i--) {
strides[i - 1] = strides[i] * shape[i];
}
return strides;
}
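As a quick check of the helper above: make_contiguous_strides<size_t>({2, 3, 4}) yields {12, 4, 1}, the row-major strides of a freshly allocated contiguous buffer of that shape.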
// Collapse dims that are contiguous to possibly route to a better kernel
// e.g. for x = transpose(array({0, 1, 2, 3, 4, 5, 6, 7}, {2, 2, 2}), {2, 0, 1})
// should return {{2, 4}, {{1, 2}}}.

View File

@@ -18,7 +18,7 @@ function(make_jit_source SRC_FILE)
${CMAKE_C_COMPILER}
${PROJECT_SOURCE_DIR}
${SRC_FILE}
"-D${MLX_METAL_VERSION}"
"-DMLX_METAL_VERSION=${MLX_METAL_VERSION}"
DEPENDS make_compiled_preamble.sh
kernels/${SRC_FILE}.h
${ARGN}
@@ -52,6 +52,7 @@ make_jit_source(
)
make_jit_source(scatter)
make_jit_source(gather)
make_jit_source(hadamard)
if (MLX_METAL_JIT)
target_sources(
@@ -64,6 +65,11 @@ if (MLX_METAL_JIT)
make_jit_source(unary)
make_jit_source(binary)
make_jit_source(binary_two)
make_jit_source(
fft
kernels/fft/radix.h
kernels/fft/readwrite.h
)
make_jit_source(ternary)
make_jit_source(softmax)
make_jit_source(scan)
@@ -107,6 +113,7 @@ if (MLX_METAL_JIT)
kernels/steel/defines.h
kernels/steel/conv/loaders/loader_general.h
)
make_jit_source(quantized)
else()
target_sources(
mlx
@@ -126,6 +133,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
@@ -135,6 +143,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/normalization.cpp
${CMAKE_CURRENT_SOURCE_DIR}/rope.cpp
${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp

View File

@@ -6,20 +6,29 @@
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#define BINARY_GPU(func) \
void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
binary_op_gpu(inputs, out, get_primitive_string(this)); \
}
#define BINARY_GPU_MULTI(func) \
void func::eval_gpu( \
const std::vector<array>& inputs, std::vector<array>& outputs) { \
binary_op_gpu(inputs, outputs, get_primitive_string(this)); \
}
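For reference, a sketch of the expansion for a single primitive (substituting func = Add):
// BINARY_GPU(Add) expands, roughly, to:
//   void Add::eval_gpu(const std::vector<array>& inputs, array& out) {
//     binary_op_gpu(inputs, out, get_primitive_string(this));
//   }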
namespace mlx::core {
constexpr int MAX_BINARY_SPECIALIZED_DIMS = 5;
void binary_op(
void binary_op_gpu_inplace(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string op) {
assert(inputs.size() == 2);
const std::string op,
const Stream& s) {
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, outputs[0], bopt, true);
set_binary_op_output_data(a, b, outputs[1], bopt, true);
auto& out = outputs[0];
if (out.size() == 0) {
@@ -61,10 +70,10 @@ void binary_op(
kernel_name = kname.str();
}
auto& s = out.primitive().stream();
auto& d = metal::device(s.device);
auto kernel = get_binary_two_kernel(d, kernel_name, a, outputs[0]);
auto kernel =
get_binary_two_kernel(d, kernel_name, a.dtype(), outputs[0].dtype(), op);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
@@ -120,15 +129,36 @@ void binary_op(
}
}
void binary_op(
void binary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string op) {
std::vector<array>& outputs,
const std::string op,
const Stream& s) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out, bopt, true);
set_binary_op_output_data(a, b, outputs[0], bopt, true);
set_binary_op_output_data(a, b, outputs[1], bopt, true);
binary_op_gpu_inplace(inputs, outputs, op, s);
}
void binary_op_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string op) {
auto& s = outputs[0].primitive().stream();
binary_op_gpu(inputs, outputs, op, s);
}
void binary_op_gpu_inplace(
const std::vector<array>& inputs,
array& out,
const std::string op,
const Stream& s) {
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
if (out.size() == 0) {
return;
}
@@ -168,10 +198,9 @@ void binary_op(
kernel_name = kname.str();
}
auto& s = out.primitive().stream();
auto& d = metal::device(s.device);
auto kernel = get_binary_kernel(d, kernel_name, a, out);
auto kernel = get_binary_kernel(d, kernel_name, a.dtype(), out.dtype(), op);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
bool donate_a = a.data_shared_ptr() == nullptr;
@@ -221,102 +250,65 @@ void binary_op(
}
}
void Add::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "add");
void binary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string op,
const Stream& s) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out, bopt, true);
binary_op_gpu_inplace(inputs, out, op, s);
}
void ArcTan2::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "arctan2");
void binary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string op) {
auto& s = out.primitive().stream();
binary_op_gpu(inputs, out, op, s);
}
BINARY_GPU(Add)
BINARY_GPU(ArcTan2)
BINARY_GPU(Divide)
BINARY_GPU_MULTI(DivMod)
BINARY_GPU(Remainder)
BINARY_GPU(Equal)
BINARY_GPU(Greater)
BINARY_GPU(GreaterEqual)
BINARY_GPU(Less)
BINARY_GPU(LessEqual)
BINARY_GPU(LogicalAnd)
BINARY_GPU(LogicalOr)
BINARY_GPU(LogAddExp)
BINARY_GPU(Maximum)
BINARY_GPU(Minimum)
BINARY_GPU(Multiply)
BINARY_GPU(NotEqual)
BINARY_GPU(Power)
BINARY_GPU(Subtract)
void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
switch (op_) {
case BitwiseBinary::And:
binary_op(inputs, out, "bitwise_and");
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
case BitwiseBinary::Or:
binary_op(inputs, out, "bitwise_or");
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
case BitwiseBinary::Xor:
binary_op(inputs, out, "bitwise_xor");
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
case BitwiseBinary::LeftShift:
binary_op(inputs, out, "left_shift");
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
case BitwiseBinary::RightShift:
binary_op(inputs, out, "right_shift");
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
}
}
void Divide::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "div");
}
void DivMod::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
binary_op(inputs, outputs, "divmod");
}
void Remainder::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "rem");
}
void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, equal_nan_ ? "naneq" : "eq");
}
void Greater::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "ge");
}
void GreaterEqual::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "geq");
}
void Less::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "le");
}
void LessEqual::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "leq");
}
void LogicalAnd::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "land");
}
void LogicalOr::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "lor");
}
void LogAddExp::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "lae");
}
void Maximum::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "max");
}
void Minimum::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "min");
}
void Multiply::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "mul");
}
void NotEqual::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "neq");
}
void Power::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "pow");
}
void Subtract::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "sub");
}
} // namespace mlx::core

View File

@@ -0,0 +1,33 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/array.h"
namespace mlx::core {
void binary_op_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string op,
const Stream& s);
void binary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string op,
const Stream& s);
void binary_op_gpu_inplace(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string op,
const Stream& s);
void binary_op_gpu_inplace(
const std::vector<array>& inputs,
array& out,
const std::string op,
const Stream& s);
} // namespace mlx::core

View File

@@ -33,9 +33,6 @@ void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
if (out.size() == 0) {
return;
}
if (ctype == CopyType::GeneralGeneral) {
ctype = CopyType::General;
}
@@ -57,6 +54,10 @@ void copy_gpu_inplace(
int64_t out_offset,
CopyType ctype,
const Stream& s) {
if (out.size() == 0) {
return;
}
// Try to collapse contiguous dims
auto [shape, strides] = collapse_contiguous_dims(
data_shape, std::vector{strides_in_pre, strides_out_pre});

View File

@@ -30,7 +30,9 @@ constexpr int MAX_DISPATCHES_PER_ENCODER = 2;
constexpr const char* default_mtllib_path = METAL_PATH;
constexpr auto get_metal_version() {
#if defined METAL_3_1
#if (MLX_METAL_VERSION >= 320)
return MTL::LanguageVersion3_2;
#elif (MLX_METAL_VERSION >= 310)
return MTL::LanguageVersion3_1;
#else
return MTL::LanguageVersion3_0;

View File

@@ -1,106 +1,806 @@
// Copyright © 2023 Apple Inc.
#include <cassert>
#include <complex>
#include <map>
#include <numeric>
#include <set>
#include "mlx/3rdparty/pocketfft.h"
#include "mlx/backend/metal/binary.h"
#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/slicing.h"
#include "mlx/backend/metal/unary.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/mlx.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
namespace mlx::core {
void FFT::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& s = out.primitive().stream();
auto& d = metal::device(s.device);
using MTLFC = std::tuple<const void*, MTL::DataType, NS::UInteger>;
auto& in = inputs[0];
#define MAX_STOCKHAM_FFT_SIZE 4096
#define MAX_RADER_FFT_SIZE 2048
#define MAX_BLUESTEIN_FFT_SIZE 2048
// Threadgroup memory batching improves throughput for small n
#define MIN_THREADGROUP_MEM_SIZE 256
// For strided reads/writes, coalesce at least this many complex64s
#define MIN_COALESCE_WIDTH 4
if (axes_.size() == 0 || axes_.size() > 1 || inverse_ ||
in.dtype() != complex64 || out.dtype() != complex64) {
// Could also fallback to CPU implementation here.
throw std::runtime_error(
"GPU FFT is only implemented for 1D, forward, complex FFTs.");
inline const std::vector<int> supported_radices() {
// Ordered by preference in decomposition.
return {13, 11, 8, 7, 6, 5, 4, 3, 2};
}
std::vector<int> prime_factors(int n) {
int z = 2;
std::vector<int> factors;
while (z * z <= n) {
if (n % z == 0) {
factors.push_back(z);
n /= z;
} else {
z++;
}
}
if (n > 1) {
factors.push_back(n);
}
return factors;
}
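For example, prime_factors(60) returns {2, 2, 3, 5} and prime_factors(13) returns {13}: trial division only runs while z * z <= n, and any remainder greater than 1 is appended as the final prime factor.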
struct FourStepParams {
bool required = false;
bool first_step = true;
int n1 = 0;
int n2 = 0;
};
// Forward Declaration
void fft_op(
const array& in,
array& out,
size_t axis,
bool inverse,
bool real,
const FourStepParams four_step_params,
bool inplace,
const Stream& s);
struct FFTPlan {
int n = 0;
// Number of steps for each radix in the Stockham decomposition
std::vector<int> stockham;
// Number of steps for each radix in the Rader decomposition
std::vector<int> rader;
// Rader factor, 1 if no rader factors
int rader_n = 1;
int bluestein_n = -1;
// Four step FFT
bool four_step = false;
int n1 = 0;
int n2 = 0;
};
int next_fast_n(int n) {
return next_power_of_2(n);
}
std::vector<int> plan_stockham_fft(int n) {
auto radices = supported_radices();
std::vector<int> plan(radices.size(), 0);
int orig_n = n;
if (n == 1) {
return plan;
}
for (int i = 0; i < radices.size(); i++) {
int radix = radices[i];
// Manually tuned radices for powers of 2
if (is_power_of_2(orig_n) && orig_n < 512 && radix > 4) {
continue;
}
while (n % radix == 0) {
plan[i] += 1;
n /= radix;
if (n == 1) {
return plan;
}
}
}
throw std::runtime_error("Unplannable");
}
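As a sketch of the decomposition above: for n = 60 the radices are consumed in preference order as 6 * 5 * 2, so the plan records one step each at the radix-6, radix-5 and radix-2 slots; for a small power of two such as n = 64, radices above 4 are skipped and the plan is three radix-4 steps.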
FFTPlan plan_fft(int n) {
auto radices = supported_radices();
std::set<int> radices_set(radices.begin(), radices.end());
FFTPlan plan;
plan.n = n;
plan.rader = std::vector<int>(radices.size(), 0);
auto factors = prime_factors(n);
int remaining_n = n;
// Four Step FFT when N is too large for shared mem.
if (n > MAX_STOCKHAM_FFT_SIZE && is_power_of_2(n)) {
// For powers of two we have a fast, no-transpose four step implementation.
plan.four_step = true;
// Rough heuristic for choosing faster powers of two when we can
plan.n2 = n > 65536 ? 1024 : 64;
plan.n1 = n / plan.n2;
return plan;
} else if (n > MAX_STOCKHAM_FFT_SIZE) {
// Otherwise we use a multi-upload Bluestein's
plan.four_step = true;
plan.bluestein_n = next_fast_n(2 * n - 1);
return plan;
}
size_t n = in.shape(axes_[0]);
for (int factor : factors) {
// Make sure the factor is a supported radix
if (radices_set.find(factor) == radices_set.end()) {
// We only support a single Rader factor currently
// TODO(alexbarron) investigate weirdness with large
// Rader sizes -- possibly a compiler issue?
if (plan.rader_n > 1 || n > MAX_RADER_FFT_SIZE) {
plan.four_step = n > MAX_BLUESTEIN_FFT_SIZE;
plan.bluestein_n = next_fast_n(2 * n - 1);
plan.stockham = plan_stockham_fft(plan.bluestein_n);
plan.rader = std::vector<int>(radices.size(), 0);
return plan;
}
// See if we can use Rader's algorithm to Stockham decompose n - 1
auto rader_factors = prime_factors(factor - 1);
int last_factor = -1;
for (int rf : rader_factors) {
// We don't nest Rader's algorithm so if `factor - 1`
// isn't Stockham decomposable we give up and do Bluestein's.
if (radices_set.find(rf) == radices_set.end()) {
plan.four_step = n > MAX_BLUESTEIN_FFT_SIZE;
plan.bluestein_n = next_fast_n(2 * n - 1);
plan.stockham = plan_stockham_fft(plan.bluestein_n);
plan.rader = std::vector<int>(radices.size(), 0);
return plan;
}
}
plan.rader = plan_stockham_fft(factor - 1);
plan.rader_n = factor;
remaining_n /= factor;
}
}
if (!is_power_of_2(n) || n > 2048 || n < 4) {
throw std::runtime_error(
"GPU FFT is only implemented for the powers of 2 from 4 -> 2048");
plan.stockham = plan_stockham_fft(remaining_n);
return plan;
}
int compute_elems_per_thread(FFTPlan plan) {
// Heuristics for selecting an efficient number
// of threads to use for a particular mixed-radix FFT.
auto n = plan.n;
std::vector<int> steps;
auto radices = supported_radices();
steps.insert(steps.end(), plan.stockham.begin(), plan.stockham.end());
steps.insert(steps.end(), plan.rader.begin(), plan.rader.end());
std::set<int> used_radices;
for (int i = 0; i < steps.size(); i++) {
int radix = radices[i % radices.size()];
if (steps[i] > 0) {
used_radices.insert(radix);
}
}
// Manual tuning for 7/11/13
if (used_radices.find(7) != used_radices.end() &&
(used_radices.find(11) != used_radices.end() ||
used_radices.find(13) != used_radices.end())) {
return 7;
} else if (
used_radices.find(11) != used_radices.end() &&
used_radices.find(13) != used_radices.end()) {
return 11;
}
// TODO(alexbarron) Some really weird stuff is going on
// for certain `elems_per_thread` on large composite n.
// Possibly a compiler issue?
if (n == 3159)
return 13;
if (n == 3645)
return 5;
if (n == 3969)
return 7;
if (n == 1982)
return 5;
if (used_radices.size() == 1) {
return *(used_radices.begin());
}
if (used_radices.size() == 2) {
if (used_radices.find(11) != used_radices.end() ||
used_radices.find(13) != used_radices.end()) {
return std::accumulate(used_radices.begin(), used_radices.end(), 0) / 2;
}
std::vector<int> radix_vec(used_radices.begin(), used_radices.end());
return radix_vec[1];
}
// In all other cases use the second smallest radix.
std::vector<int> radix_vec(used_radices.begin(), used_radices.end());
return radix_vec[1];
}
// Rader
int mod_exp(int x, int y, int n) {
int out = 1;
while (y) {
if (y & 1) {
out = out * x % n;
}
y >>= 1;
x = x * x % n;
}
return out;
}
int primitive_root(int n) {
auto factors = prime_factors(n - 1);
for (int r = 2; r < n - 1; r++) {
bool found = true;
for (int factor : factors) {
if (mod_exp(r, (n - 1) / factor, n) == 1) {
found = false;
break;
}
}
if (found) {
return r;
}
}
return -1;
}
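A quick sketch of the number theory above: mod_exp(3, 5, 7) computes 3^5 mod 7 = 5 by square-and-multiply, and primitive_root(7) returns 3, since 3^2 = 2 and 3^3 = 6 are both different from 1 mod 7, whereas r = 2 fails because 2^3 = 1 mod 7.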
std::tuple<array, array, array> compute_raders_constants(
int rader_n,
const Stream& s) {
int proot = primitive_root(rader_n);
// Fermat's little theorem
int inv = mod_exp(proot, rader_n - 2, rader_n);
std::vector<short> g_q(rader_n - 1);
std::vector<short> g_minus_q(rader_n - 1);
for (int i = 0; i < rader_n - 1; i++) {
g_q[i] = mod_exp(proot, i, rader_n);
g_minus_q[i] = mod_exp(inv, i, rader_n);
}
array g_q_arr(g_q.begin(), {rader_n - 1});
array g_minus_q_arr(g_minus_q.begin(), {rader_n - 1});
std::vector<std::complex<float>> b_q(rader_n - 1);
for (int i = 0; i < rader_n - 1; i++) {
float pi_i = (float)g_minus_q[i] * -2.0 * M_PI / rader_n;
b_q[i] = std::exp(std::complex<float>(0, pi_i));
}
array b_q_fft({rader_n - 1}, complex64, nullptr, {});
b_q_fft.set_data(allocator::malloc_or_wait(b_q_fft.nbytes()));
auto b_q_fft_ptr =
reinterpret_cast<std::complex<float>*>(b_q_fft.data<complex64_t>());
std::ptrdiff_t item_size = b_q_fft.itemsize();
size_t fft_size = rader_n - 1;
// This FFT is always small (<4096, batch 1) so save some overhead
// and do it on the CPU
pocketfft::c2c(
/* shape= */ {fft_size},
/* stride_in= */ {item_size},
/* stride_out= */ {item_size},
/* axes= */ {0},
/* forward= */ true,
/* data_in= */ b_q.data(),
/* data_out= */ b_q_fft_ptr,
/* scale= */ 1.0f);
return std::make_tuple(b_q_fft, g_q_arr, g_minus_q_arr);
}
// Bluestein
std::pair<array, array> compute_bluestein_constants(int n, int bluestein_n) {
// We need to calculate the Bluestein twiddle factors
// in double precision for the overall numerical stability
// of Bluestein's FFT algorithm to be acceptable.
//
// Metal doesn't support float64, so instead we
// manually implement the required operations on cpu.
//
// In numpy:
// w_k = np.exp(-1j * np.pi / N * (np.arange(-N + 1, N) ** 2))
// w_q = np.fft.fft(1/w_k)
// return w_k, w_q
int length = 2 * n - 1;
std::vector<std::complex<float>> w_k_vec(n);
std::vector<std::complex<float>> w_q_vec(bluestein_n, 0);
for (int i = -n + 1; i < n; i++) {
double theta = pow(i, 2) * M_PI / (double)n;
w_q_vec[i + n - 1] = std::exp(std::complex<double>(0, theta));
if (i >= 0) {
w_k_vec[i] = std::exp(std::complex<double>(0, -theta));
}
}
array w_k({n}, complex64, nullptr, {});
w_k.set_data(allocator::malloc_or_wait(w_k.nbytes()));
std::copy(w_k_vec.begin(), w_k_vec.end(), w_k.data<complex64_t>());
array w_q({bluestein_n}, complex64, nullptr, {});
w_q.set_data(allocator::malloc_or_wait(w_q.nbytes()));
auto w_q_ptr =
reinterpret_cast<std::complex<float>*>(w_q.data<complex64_t>());
std::ptrdiff_t item_size = w_q.itemsize();
size_t fft_size = bluestein_n;
pocketfft::c2c(
/* shape= */ {fft_size},
/* stride_in= */ {item_size},
/* stride_out= */ {item_size},
/* axes= */ {0},
/* forward= */ true,
/* data_in= */ w_q_vec.data(),
/* data_out= */ w_q_ptr,
/* scale= */ 1.0f);
return std::make_tuple(w_k, w_q);
}
void multi_upload_bluestein_fft(
const array& in,
array& out,
size_t axis,
bool inverse,
bool real,
FFTPlan& plan,
std::vector<array> copies,
const Stream& s) {
// TODO(alexbarron) Implement fused kernels for multi-upload Bluestein's
// algorithm
int n = inverse ? out.shape(axis) : in.shape(axis);
auto [w_k, w_q] = compute_bluestein_constants(n, plan.bluestein_n);
// Broadcast w_q and w_k to the batch size
std::vector<size_t> b_strides(in.ndim(), 0);
b_strides[axis] = 1;
array w_k_broadcast({}, complex64, nullptr, {});
array w_q_broadcast({}, complex64, nullptr, {});
w_k_broadcast.copy_shared_buffer(w_k, b_strides, {}, w_k.data_size());
w_q_broadcast.copy_shared_buffer(w_q, b_strides, {}, w_q.data_size());
auto temp_shape = inverse ? out.shape() : in.shape();
array temp(temp_shape, complex64, nullptr, {});
array temp1(temp_shape, complex64, nullptr, {});
if (real && !inverse) {
// Convert float32->complex64
copy_gpu(in, temp, CopyType::General, s);
} else if (real && inverse) {
int back_offset = n % 2 == 0 ? 2 : 1;
auto slice_shape = in.shape();
slice_shape[axis] -= back_offset;
array slice_temp(slice_shape, complex64, nullptr, {});
array conj_temp(in.shape(), complex64, nullptr, {});
copies.push_back(slice_temp);
copies.push_back(conj_temp);
std::vector<int> rstarts(in.ndim(), 0);
std::vector<int> rstrides(in.ndim(), 1);
rstarts[axis] = in.shape(axis) - back_offset;
rstrides[axis] = -1;
unary_op_gpu({in}, conj_temp, "Conjugate", s);
slice_gpu(in, slice_temp, rstarts, rstrides, s);
concatenate_gpu({conj_temp, slice_temp}, temp, (int)axis, s);
} else if (inverse) {
unary_op_gpu({in}, temp, "Conjugate", s);
} else {
temp.copy_shared_buffer(in);
}
binary_op_gpu({temp, w_k_broadcast}, temp1, "Multiply", s);
std::vector<std::pair<int, int>> pads;
auto padded_shape = out.shape();
padded_shape[axis] = plan.bluestein_n;
array pad_temp(padded_shape, complex64, nullptr, {});
pad_gpu(temp1, array(complex64_t{0.0f, 0.0f}), pad_temp, {(int)axis}, {0}, s);
array pad_temp1(padded_shape, complex64, nullptr, {});
fft_op(
pad_temp,
pad_temp1,
axis,
/*inverse=*/false,
/*real=*/false,
FourStepParams(),
/*inplace=*/false,
s);
binary_op_gpu_inplace({pad_temp1, w_q_broadcast}, pad_temp, "Multiply", s);
fft_op(
pad_temp,
pad_temp1,
axis,
/* inverse= */ true,
/* real= */ false,
FourStepParams(),
/*inplace=*/true,
s);
int offset = plan.bluestein_n - (2 * n - 1);
std::vector<int> starts(in.ndim(), 0);
std::vector<int> strides(in.ndim(), 1);
starts[axis] = plan.bluestein_n - offset - n;
slice_gpu(pad_temp1, temp, starts, strides, s);
binary_op_gpu_inplace({temp, w_k_broadcast}, temp1, "Multiply", s);
if (real && !inverse) {
std::vector<int> rstarts(in.ndim(), 0);
std::vector<int> rstrides(in.ndim(), 1);
slice_gpu(temp1, out, rstarts, strides, s);
} else if (real && inverse) {
std::vector<size_t> b_strides(in.ndim(), 0);
auto inv_n = array({1.0f / n}, {1}, float32);
array temp_float(out.shape(), out.dtype(), nullptr, {});
copies.push_back(temp_float);
copies.push_back(inv_n);
copy_gpu(temp1, temp_float, CopyType::General, s);
binary_op_gpu({temp_float, inv_n}, out, "Multiply", s);
} else if (inverse) {
auto inv_n = array({1.0f / n}, {1}, complex64);
unary_op_gpu({temp1}, temp, "Conjugate", s);
binary_op_gpu({temp, inv_n}, out, "Multiply", s);
copies.push_back(inv_n);
} else {
out.copy_shared_buffer(temp1);
}
copies.push_back(w_k);
copies.push_back(w_q);
copies.push_back(w_k_broadcast);
copies.push_back(w_q_broadcast);
copies.push_back(temp);
copies.push_back(temp1);
copies.push_back(pad_temp);
copies.push_back(pad_temp1);
}
void four_step_fft(
const array& in,
array& out,
size_t axis,
bool inverse,
bool real,
FFTPlan& plan,
std::vector<array> copies,
const Stream& s) {
auto& d = metal::device(s.device);
if (plan.bluestein_n == -1) {
// Fast no transpose implementation for powers of 2.
FourStepParams four_step_params = {
/* required= */ true, /* first_step= */ true, plan.n1, plan.n2};
auto temp_shape = (real && inverse) ? out.shape() : in.shape();
array temp(temp_shape, complex64, nullptr, {});
fft_op(
in, temp, axis, inverse, real, four_step_params, /*inplace=*/false, s);
four_step_params.first_step = false;
fft_op(
temp, out, axis, inverse, real, four_step_params, /*inplace=*/false, s);
copies.push_back(temp);
} else {
multi_upload_bluestein_fft(in, out, axis, inverse, real, plan, copies, s);
}
}
void fft_op(
const array& in,
array& out,
size_t axis,
bool inverse,
bool real,
const FourStepParams four_step_params,
bool inplace,
const Stream& s) {
auto& d = metal::device(s.device);
size_t n = out.dtype() == float32 ? out.shape(axis) : in.shape(axis);
if (n == 1) {
out.copy_shared_buffer(in);
return;
}
if (four_step_params.required) {
// Four Step FFT decomposes into two FFTs: n1 on columns, n2 on rows
n = four_step_params.first_step ? four_step_params.n1 : four_step_params.n2;
}
// Make sure that the array is contiguous and has stride 1 in the FFT dim
std::vector<array> copies;
auto check_input = [this, &copies, &s](const array& x) {
auto check_input = [&axis, &copies, &s](const array& x) {
// TODO: Pass the strides to the kernel so
// we can avoid the copy when x is not contiguous.
bool no_copy = x.strides()[axes_[0]] == 1 && x.flags().row_contiguous ||
x.flags().col_contiguous;
bool no_copy = x.strides()[axis] == 1 &&
(x.flags().row_contiguous || x.flags().col_contiguous);
if (no_copy) {
return x;
} else {
array x_copy(x.shape(), x.dtype(), nullptr, {});
std::vector<size_t> strides;
size_t cur_stride = x.shape(axes_[0]);
for (int axis = 0; axis < x.ndim(); axis++) {
if (axis == axes_[0]) {
size_t cur_stride = x.shape(axis);
for (int a = 0; a < x.ndim(); a++) {
if (a == axis) {
strides.push_back(1);
} else {
strides.push_back(cur_stride);
cur_stride *= x.shape(axis);
cur_stride *= x.shape(a);
}
}
auto flags = x.flags();
size_t f_stride = 1;
size_t b_stride = 1;
flags.col_contiguous = true;
flags.row_contiguous = true;
for (int i = 0, ri = x.ndim() - 1; i < x.ndim(); ++i, --ri) {
flags.col_contiguous &= (strides[i] == f_stride || x.shape(i) == 1);
f_stride *= x.shape(i);
flags.row_contiguous &= (strides[ri] == b_stride || x.shape(ri) == 1);
b_stride *= x.shape(ri);
}
// This is probably over-conservative
flags.contiguous = false;
auto [data_size, is_row_contiguous, is_col_contiguous] =
check_contiguity(x.shape(), strides);
flags.col_contiguous = is_row_contiguous;
flags.row_contiguous = is_col_contiguous;
flags.contiguous = data_size == x_copy.size();
x_copy.set_data(
allocator::malloc_or_wait(x.nbytes()), x.data_size(), strides, flags);
allocator::malloc_or_wait(x.nbytes()), data_size, strides, flags);
copy_gpu_inplace(x, x_copy, CopyType::GeneralGeneral, s);
copies.push_back(x_copy);
return x_copy;
}
};
const array& in_contiguous = check_input(inputs[0]);
const array& in_contiguous = check_input(in);
// real to complex: n -> (n/2)+1
// complex to real: (n/2)+1 -> n
auto out_strides = in_contiguous.strides();
size_t out_data_size = in_contiguous.data_size();
if (in.shape(axis) != out.shape(axis)) {
for (int i = 0; i < out_strides.size(); i++) {
if (out_strides[i] != 1) {
out_strides[i] = out_strides[i] / in.shape(axis) * out.shape(axis);
}
}
out_data_size = out_data_size / in.shape(axis) * out.shape(axis);
}
auto plan = plan_fft(n);
if (plan.four_step) {
four_step_fft(in, out, axis, inverse, real, plan, copies, s);
d.get_command_buffer(s.index)->addCompletedHandler(
[copies](MTL::CommandBuffer*) mutable { copies.clear(); });
return;
}
// TODO: allow donation here
out.set_data(
allocator::malloc_or_wait(out.nbytes()),
in_contiguous.data_size(),
in_contiguous.strides(),
in_contiguous.flags());
if (!inplace) {
out.set_data(
allocator::malloc_or_wait(out.nbytes()),
out_data_size,
out_strides,
in_contiguous.flags());
}
// We use n / 4 threads by default since radix-4
// is the largest single threaded radix butterfly
// we currently implement.
size_t m = n / 4;
size_t batch = in.size() / in.shape(axes_[0]);
auto radices = supported_radices();
int fft_size = plan.bluestein_n > 0 ? plan.bluestein_n : n;
// Setup function constants
bool power_of_2 = is_power_of_2(fft_size);
auto make_int = [](int* a, int i) {
return std::make_tuple(a, MTL::DataType::DataTypeInt, i);
};
auto make_bool = [](bool* a, int i) {
return std::make_tuple(a, MTL::DataType::DataTypeBool, i);
};
std::vector<MTLFC> func_consts = {
make_bool(&inverse, 0), make_bool(&power_of_2, 1)};
// Start of radix/rader step constants
int index = 4;
for (int i = 0; i < plan.stockham.size(); i++) {
func_consts.push_back(make_int(&plan.stockham[i], index));
index += 1;
}
for (int i = 0; i < plan.rader.size(); i++) {
func_consts.push_back(make_int(&plan.rader[i], index));
index += 1;
}
int elems_per_thread = compute_elems_per_thread(plan);
func_consts.push_back(make_int(&elems_per_thread, 2));
int rader_m = n / plan.rader_n;
func_consts.push_back(make_int(&rader_m, 3));
// The overall number of FFTs we're going to compute for this input
int size = out.dtype() == float32 ? out.size() : in.size();
if (real && inverse && four_step_params.required) {
size = out.size();
}
int total_batch_size = size / n;
int threads_per_fft = (fft_size + elems_per_thread - 1) / elems_per_thread;
// We batch among threadgroups for improved efficiency when n is small
int threadgroup_batch_size = std::max(MIN_THREADGROUP_MEM_SIZE / fft_size, 1);
if (four_step_params.required) {
// Require a threadgroup batch size of at least 4 for four step FFT
// so we can coalesce the memory accesses.
threadgroup_batch_size =
std::max(threadgroup_batch_size, MIN_COALESCE_WIDTH);
}
int threadgroup_mem_size = next_power_of_2(threadgroup_batch_size * fft_size);
// FFTs up to 2^20 are currently supported
assert(threadgroup_mem_size <= MAX_STOCKHAM_FFT_SIZE);
// ceil divide
int batch_size =
(total_batch_size + threadgroup_batch_size - 1) / threadgroup_batch_size;
if (real && !four_step_params.required) {
// We can perform 2 RFFTs at once so the batch size is halved.
batch_size = (batch_size + 2 - 1) / 2;
}
int out_buffer_size = out.size();
auto& compute_encoder = d.get_command_encoder(s.index);
auto in_type_str = in.dtype() == float32 ? "float" : "float2";
auto out_type_str = out.dtype() == float32 ? "float" : "float2";
// Only required by four step
int step = -1;
{
std::ostringstream kname;
kname << "fft_" << n;
auto kernel = d.get_kernel(kname.str());
std::string inv_string = inverse ? "true" : "false";
std::string real_string = real ? "true" : "false";
std::string func_name;
if (plan.bluestein_n > 0) {
kname << "bluestein_fft_mem_" << threadgroup_mem_size << "_"
<< in_type_str << "_" << out_type_str;
func_name = "bluestein_fft";
} else if (plan.rader_n > 1) {
kname << "rader_fft_mem_" << threadgroup_mem_size << "_" << in_type_str
<< "_" << out_type_str;
func_name = "rader_fft";
} else if (four_step_params.required) {
step = four_step_params.first_step ? 0 : 1;
kname << "four_step_mem_" << threadgroup_mem_size << "_" << in_type_str
<< "_" << out_type_str << "_" << step << "_" << real_string;
func_name = "four_step_fft";
} else {
kname << "fft_mem_" << threadgroup_mem_size << "_" << in_type_str << "_"
<< out_type_str;
func_name = "fft";
}
std::string base_name = kname.str();
// We use a specialized kernel for each FFT size
kname << "_n" << fft_size << "_inv_" << inverse;
std::string hash_name = kname.str();
auto template_def = func_name == "four_step_fft" ? get_template_definition(
base_name,
func_name,
threadgroup_mem_size,
in_type_str,
out_type_str,
step,
real)
: get_template_definition(
base_name,
func_name,
threadgroup_mem_size,
in_type_str,
out_type_str);
auto kernel =
get_fft_kernel(d, base_name, hash_name, func_consts, template_def);
bool donated = in.data_shared_ptr() == nullptr;
compute_encoder->setComputePipelineState(kernel);
compute_encoder.set_input_array(in_contiguous, 0);
compute_encoder.set_output_array(out, 1);
auto group_dims = MTL::Size(1, m, 1);
auto grid_dims = MTL::Size(batch, m, 1);
compute_encoder.dispatchThreads(grid_dims, group_dims);
if (plan.bluestein_n > 0) {
// Precomputed twiddle factors for Bluestein's
auto [w_k, w_q] = compute_bluestein_constants(n, plan.bluestein_n);
copies.push_back(w_q);
copies.push_back(w_k);
compute_encoder.set_input_array(w_q, 2); // w_q
compute_encoder.set_input_array(w_k, 3); // w_k
compute_encoder->setBytes(&n, sizeof(int), 4);
compute_encoder->setBytes(&plan.bluestein_n, sizeof(int), 5);
compute_encoder->setBytes(&total_batch_size, sizeof(int), 6);
} else if (plan.rader_n > 1) {
auto [b_q, g_q, g_minus_q] = compute_raders_constants(plan.rader_n, s);
copies.push_back(b_q);
copies.push_back(g_q);
copies.push_back(g_minus_q);
compute_encoder.set_input_array(b_q, 2);
compute_encoder.set_input_array(g_q, 3);
compute_encoder.set_input_array(g_minus_q, 4);
compute_encoder->setBytes(&n, sizeof(int), 5);
compute_encoder->setBytes(&total_batch_size, sizeof(int), 6);
compute_encoder->setBytes(&plan.rader_n, sizeof(int), 7);
} else if (four_step_params.required) {
compute_encoder->setBytes(&four_step_params.n1, sizeof(int), 2);
compute_encoder->setBytes(&four_step_params.n2, sizeof(int), 3);
compute_encoder->setBytes(&total_batch_size, sizeof(int), 4);
} else {
compute_encoder->setBytes(&n, sizeof(int), 2);
compute_encoder->setBytes(&total_batch_size, sizeof(int), 3);
}
auto group_dims = MTL::Size(1, threadgroup_batch_size, threads_per_fft);
auto grid_dims =
MTL::Size(batch_size, threadgroup_batch_size, threads_per_fft);
compute_encoder->dispatchThreads(grid_dims, group_dims);
}
d.get_command_buffer(s.index)->addCompletedHandler(
[copies](MTL::CommandBuffer*) mutable { copies.clear(); });
}
void fft_op(
const array& in,
array& out,
size_t axis,
bool inverse,
bool real,
bool inplace,
const Stream& s) {
fft_op(in, out, axis, inverse, real, FourStepParams(), inplace, s);
}
void nd_fft_op(
const array& in,
array& out,
const std::vector<size_t>& axes,
bool inverse,
bool real,
const Stream& s) {
// Perform ND FFT on GPU as a series of 1D FFTs
auto temp_shape = inverse ? in.shape() : out.shape();
array temp1(temp_shape, complex64, nullptr, {});
array temp2(temp_shape, complex64, nullptr, {});
std::vector<array> temp_arrs = {temp1, temp2};
for (int i = axes.size() - 1; i >= 0; i--) {
int reverse_index = axes.size() - i - 1;
// For 5D and above, we don't want to reallocate our two temporary arrays
bool inplace = reverse_index >= 3 && i != 0;
// Opposite order for fft vs ifft
int index = inverse ? reverse_index : i;
size_t axis = axes[index];
// Mirror np.fft.(i)rfftn and perform a real transform
// only on the final axis.
bool step_real = (real && index == axes.size() - 1);
int step_shape = inverse ? out.shape(axis) : in.shape(axis);
const array& in_arr = i == axes.size() - 1 ? in : temp_arrs[1 - i % 2];
array& out_arr = i == 0 ? out : temp_arrs[i % 2];
fft_op(in_arr, out_arr, axis, inverse, step_real, inplace, s);
}
std::vector<array> copies = {temp1, temp2};
auto& d = metal::device(s.device);
d.get_command_buffer(s.index)->addCompletedHandler(
[copies](MTL::CommandBuffer*) mutable { copies.clear(); });
}
void FFT::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& s = stream();
auto& in = inputs[0];
if (axes_.size() > 1) {
nd_fft_op(in, out, axes_, inverse_, real_, s);
} else {
fft_op(in, out, axes_[0], inverse_, real_, /*inplace=*/false, s);
}
}
} // namespace mlx::core

View File

@@ -0,0 +1,203 @@
// Copyright © 2024 Apple Inc.
#include <map>
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/common/hadamard.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
namespace mlx::core {
constexpr int MAX_HADAMARD_THREADS_PER_GROUP = 256;
constexpr int MAX_HADAMARD_BYTES = 32768; // 32KB
std::string gen_hadamard_codelet(int m) {
// Generate an O(m^2) Hadamard codelet for a given m
// using the Hadamard matrices above
//
// e.g. m = 2
// METAL_FUNC void hadamard_m(thread float *x) {
// float tmp[2];
// tmp[0] = + x[0] + x[1];
// tmp[1] = + x[0] - x[1];
// for (int i = 0; i < 2; i++) { x[i] = tmp[i]; }
// }
//
auto h_matrices = hadamard_matrices();
auto& matrix = h_matrices[m];
std::ostringstream source;
source << "METAL_FUNC void hadamard_radix_m(thread float *x) {" << std::endl;
if (m == 1) {
source << "}" << std::endl;
return source.str();
}
source << " float tmp[" << m << "];" << std::endl;
auto start = 1;
auto end = matrix.find('\n', start);
int index = 0;
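// Each row of the matrix string is a run of '+'/'-' signs; emit one
// signed sum per row into tmp[index].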
while (end != std::string_view::npos) {
source << " tmp[" << index << "] = ";
auto row = matrix.substr(start, end - start);
for (int i = 0; i < row.length(); i++) {
source << " " << row[i] << " x[" << i << "]";
}
source << ";" << std::endl;
start = end + 1;
end = matrix.find('\n', start);
index++;
}
source << " for (int i = 0; i < " << m << "; i++) { x[i] = tmp[i]; }"
<< std::endl;
source << "}" << std::endl;
return source.str();
}
void launch_hadamard(
const array& in,
array& out,
int batch_size,
int threads_per,
const std::string kernel_name,
float scale,
const Stream& s) {
auto& d = metal::device(s.device);
const auto& lib_name = kernel_name.substr(1);
auto lib = d.get_library(lib_name);
auto kernel = d.get_kernel(kernel_name, lib);
assert(threads_per <= kernel->maxTotalThreadsPerThreadgroup());
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
compute_encoder.set_input_array(in, 0);
compute_encoder.set_output_array(out, 1);
compute_encoder->setBytes(&scale, sizeof(float), 2);
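// One threadgroup per batch element, with threads_per threads along y.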
MTL::Size group_dims = MTL::Size(1, threads_per, 1);
MTL::Size grid_dims = MTL::Size(batch_size, threads_per, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
}
void Hadamard::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& s = stream();
auto& in = inputs[0];
std::vector<array> copies;
// Only support the last axis for now
int axis = in.ndim() - 1;
auto check_input = [&copies, &s](const array& x) {
// TODO(alexbarron) pass strides to kernel to relax this constraint
bool no_copy = x.flags().row_contiguous;
if (no_copy) {
return x;
} else {
copies.push_back(array(x.shape(), x.dtype(), nullptr, {}));
copy_gpu(x, copies.back(), CopyType::General, s);
return copies.back();
}
};
const array& in_contiguous = check_input(in);
if (in_contiguous.is_donatable()) {
out.move_shared_buffer(in_contiguous);
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
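// Split the length into n * m, where n = 2^k is handled by the radix
// kernel and m is a small Hadamard factor (e.g. 12 or 28) handled by the
// generated codelet.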
auto [n, m] = decompose_hadamard(in.shape(axis));
if (n * (int)size_of(in.dtype()) > MAX_HADAMARD_BYTES) {
throw std::invalid_argument(
"[hadamard] For n = m*2^k, 2^k > 8192 for FP32 or 2^k > 16384 for FP16/BF16 NYI");
}
int max_radix = std::min(n, 16);
// Use read_width 2 for m = 28 to avoid register spilling
int read_width = (n == 2 || m == 28) ? 2 : 4;
std::ostringstream kname;
kname << "hadamard_" << n * m << "_" << type_to_name(out);
auto kernel_name = kname.str();
auto& d = metal::device(s.device);
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
auto codelet = gen_hadamard_codelet(m);
kernel_source << metal::utils() << codelet << metal::hadamard();
kernel_source << get_template_definition(
"n" + kernel_name,
"hadamard_n",
get_type_string(in.dtype()),
n,
max_radix,
read_width);
kernel_source << get_template_definition(
"m" + kernel_name,
"hadamard_m",
get_type_string(in.dtype()),
n,
m,
read_width);
lib = d.get_library(lib_name, kernel_source.str());
}
int batch_size = in.size() / n;
int threads_per = n / max_radix;
if (m > 1) {
// When m is greater than 1, we decompose the
// computation into two uploads to the GPU:
//
// e.g. len(x) = 12*4 = 48, m = 12, n = 4
//
// y = h48 @ x
//
// Upload 1:
// tmp = a.reshape(12, 4) @ h4
//
// Upload 2:
// y = h12 @ tmp
array temp(in.shape(), in.dtype(), nullptr, {});
temp.set_data(allocator::malloc_or_wait(temp.nbytes()));
copies.push_back(temp);
launch_hadamard(
in_contiguous,
temp,
batch_size,
threads_per,
"n" + kernel_name,
1.0,
s);
// Metal sometimes reports 256 max threads per group for hadamard_m kernel
threads_per = std::min(n / read_width, MAX_HADAMARD_THREADS_PER_GROUP);
batch_size = in.size() / m / read_width / threads_per;
launch_hadamard(
temp, out, batch_size, threads_per, "m" + kernel_name, scale_, s);
} else {
launch_hadamard(
in_contiguous,
out,
batch_size,
threads_per,
"n" + kernel_name,
scale_,
s);
}
d.get_command_buffer(s.index)->addCompletedHandler(
[copies](MTL::CommandBuffer*) mutable { copies.clear(); });
}
} // namespace mlx::core

@@ -293,7 +293,18 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
out.shape().data(), out.shape().size() * sizeof(int), 3);
compute_encoder->setBytes(
out.strides().data(), out.strides().size() * sizeof(size_t), 4);
compute_encoder->setBytes(&upd_size, sizeof(size_t), 5);
size_t out_ndim = out.ndim();
compute_encoder->setBytes(&out_ndim, sizeof(out_ndim), 5);
if (upd_ndim <= 1) {
// Placeholder so Metal doesn't complain
int shape_ = 0;
compute_encoder->setBytes(&shape_, sizeof(int), 6);
} else {
compute_encoder->setBytes(upd.shape().data(), upd_ndim * sizeof(int), 6);
}
compute_encoder->setBytes(&upd_ndim, sizeof(size_t), 7);
compute_encoder->setBytes(&upd_size, sizeof(size_t), 8);
// Set index buffers
for (int i = 0; i < nidx; ++i) {

@@ -1,87 +0,0 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view binary_kernels = R"(
template [[host_name("ss{0}")]] [[kernel]]
void binary_ss<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
uint index [[thread_position_in_grid]]);
template [[host_name("vs{0}")]] [[kernel]]
void binary_vs<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
uint index [[thread_position_in_grid]]);
template [[host_name("sv{0}")]] [[kernel]]
void binary_sv<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
uint index [[thread_position_in_grid]]);
template [[host_name("vv{0}")]] [[kernel]]
void binary_vv<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
uint index [[thread_position_in_grid]]);
template [[host_name("g4{0}")]] [[kernel]] void
binary_g_nd<{1}, {2}, {3}, 4>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const int shape[4],
constant const size_t a_strides[4],
constant const size_t b_strides[4],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g5{0}")]] [[kernel]] void
binary_g_nd<{1}, {2}, {3}, 5>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const int shape[5],
constant const size_t a_strides[5],
constant const size_t b_strides[5],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g1{0}")]] [[kernel]] void
binary_g_nd1<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]);
template [[host_name("g2{0}")]] [[kernel]] void
binary_g_nd2<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]);
template [[host_name("g3{0}")]] [[kernel]] void
binary_g_nd3<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("gn{0}")]] [[kernel]]
void binary_g<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
)";

@@ -1,98 +0,0 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view binary_two_kernels = R"(
template [[host_name("ss{0}")]] [[kernel]]
void binary_ss<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("vs{0}")]] [[kernel]]
void binary_vs<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("sv{0}")]] [[kernel]]
void binary_sv<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("vv{0}")]] [[kernel]]
void binary_vv<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("g4{0}")]] [[kernel]] void
binary_g_nd<{1}, {2}, {3}, 4>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const int shape[4],
constant const size_t a_strides[4],
constant const size_t b_strides[4],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g5{0}")]] [[kernel]] void
binary_g_nd<{1}, {2}, {3}, 5>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const int shape[5],
constant const size_t a_strides[5],
constant const size_t b_strides[5],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g1{0}")]] [[kernel]] void
binary_g_nd1<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]);
template [[host_name("g2{0}")]] [[kernel]] void
binary_g_nd2<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]);
template [[host_name("g3{0}")]] [[kernel]] void
binary_g_nd3<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("gn{0}")]] [[kernel]]
void binary_g<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
)";

@@ -17,6 +17,9 @@ const char* unary();
const char* binary();
const char* binary_two();
const char* copy();
const char* fft();
const char* hadamard();
const char* quantized();
const char* ternary();
const char* scan();
const char* softmax();

@@ -38,12 +38,24 @@ constexpr std::string_view scatter_kernels = R"(
device mlx_atomic<{1}>* out [[buffer(2)]],
const constant int* out_shape [[buffer(3)]],
const constant size_t* out_strides [[buffer(4)]],
const constant size_t& upd_size [[buffer(5)]],
const constant size_t& out_ndim [[buffer(5)]],
const constant int* upd_shape [[buffer(6)]],
const constant size_t& upd_ndim [[buffer(7)]],
const constant size_t& upd_size [[buffer(8)]],
{5}
uint2 gid [[thread_position_in_grid]]) {{
const array<const device {2}*, {4}> idx_buffers = {{ {6} }};
return scatter_1d_index_impl<{1}, {2}, {3}, {4}>(
updates, out, out_shape, out_strides, upd_size, idx_buffers, gid);
updates,
out,
out_shape,
out_strides,
out_ndim,
upd_shape,
upd_ndim,
upd_size,
idx_buffers,
gid);
}}
[[kernel]] void scatter{0}_{4}(

@@ -1,81 +0,0 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view block_sort_kernels = R"(
template [[host_name("carg_{0}")]] [[kernel]] void
block_sort<{1}, {2}, true, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& stride_segment_axis [[buffer(4)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
template [[host_name("ncarg_{0}")]] [[kernel]] void
block_sort_nc<{1}, {2}, true, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& nc_dim [[buffer(4)]],
const device int* nc_shape [[buffer(5)]],
const device size_t* nc_strides [[buffer(6)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
template [[host_name("c_{0}")]] [[kernel]] void
block_sort<{1}, {2}, false, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& stride_segment_axis [[buffer(4)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
template [[host_name("nc_{0}")]] [[kernel]] void
block_sort_nc<{1}, {2}, false, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& nc_dim [[buffer(4)]],
const device int* nc_shape [[buffer(5)]],
const device size_t* nc_strides [[buffer(6)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
)";
constexpr std::string_view multiblock_sort_kernels = R"(
template [[host_name("sort_{0}")]] [[kernel]] void
mb_block_sort<{1}, {2}, true, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {1}* out_vals [[buffer(1)]],
device {2}* out_idxs [[buffer(2)]],
const constant int& size_sorted_axis [[buffer(3)]],
const constant int& stride_sorted_axis [[buffer(4)]],
const constant int& nc_dim [[buffer(5)]],
const device int* nc_shape [[buffer(6)]],
const device size_t* nc_strides [[buffer(7)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
template [[host_name("partition_{0}")]] [[kernel]] void
mb_block_partition<{1}, {2}, true, {3}, {4}>(
device {2}* block_partitions [[buffer(0)]],
const device {1}* dev_vals [[buffer(1)]],
const device {2}* dev_idxs [[buffer(2)]],
const constant int& size_sorted_axis [[buffer(3)]],
const constant int& merge_tiles [[buffer(4)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 tgp_dims [[threads_per_threadgroup]]);
template [[host_name("merge_{0}")]] [[kernel]] void
mb_block_merge<{1}, {2}, true, {3}, {4}>(
const device {2}* block_partitions [[buffer(0)]],
const device {1}* dev_vals_in [[buffer(1)]],
const device {2}* dev_idxs_in [[buffer(2)]],
device {1}* dev_vals_out [[buffer(3)]],
device {2}* dev_idxs_out [[buffer(4)]],
const constant int& size_sorted_axis [[buffer(5)]],
const constant int& merge_tiles [[buffer(6)]],
const constant int& num_tiles [[buffer(7)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
)";

@@ -1,80 +0,0 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view ternary_kernels = R"(
template [[host_name("v_{0}")]] [[kernel]] void ternary_v<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("g_{0}")]] [[kernel]] void ternary_g<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const size_t* c_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g1_{0}")]] [[kernel]] void
ternary_g_nd1<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const size_t& a_strides,
constant const size_t& b_strides,
constant const size_t& c_strides,
uint index [[thread_position_in_grid]]);
template [[host_name("g2_{0}")]] [[kernel]] void
ternary_g_nd2<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
constant const size_t c_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]);
template [[host_name("g3_{0}")]] [[kernel]] void
ternary_g_nd3<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
constant const size_t c_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g4_{0}")]] [[kernel]] void
ternary_g_nd<{1}, {2}, 4>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const int shape[4],
constant const size_t a_strides[4],
constant const size_t b_strides[4],
constant const size_t c_strides[4],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g5_{0}")]] [[kernel]] void
ternary_g_nd<{1}, {2}, 5>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const int shape[5],
constant const size_t a_strides[5],
constant const size_t b_strides[5],
constant const size_t c_strides[5],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
)";

@@ -1,16 +0,0 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view unary_kernels = R"(
template [[host_name("v{0}")]] [[kernel]] void unary_v<{1}, {2}>(
device const {1}* in,
device {1}* out,
uint index [[thread_position_in_grid]]);
template [[host_name("g{0}")]] [[kernel]] void unary_g<{1}, {2}>(
device const {1}* in,
device {1}* out,
device const int* in_shape,
device const size_t* in_strides,
device const int& ndim,
uint index [[thread_position_in_grid]]);
)";

@@ -1,20 +1,15 @@
// Copyright © 2024 Apple Inc.
#include <fmt/format.h>
#include <map>
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/metal/jit/arange.h"
#include "mlx/backend/metal/jit/binary.h"
#include "mlx/backend/metal/jit/binary_two.h"
#include "mlx/backend/metal/jit/copy.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/jit/reduce.h"
#include "mlx/backend/metal/jit/scan.h"
#include "mlx/backend/metal/jit/softmax.h"
#include "mlx/backend/metal/jit/sort.h"
#include "mlx/backend/metal/jit/steel_conv.h"
#include "mlx/backend/metal/jit/steel_gemm.h"
#include "mlx/backend/metal/jit/ternary.h"
#include "mlx/backend/metal/jit/unary.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
@@ -47,38 +42,76 @@ MTL::ComputePipelineState* get_arange_kernel(
MTL::ComputePipelineState* get_unary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out) {
Dtype out_type,
const std::string op) {
std::string lib_name = kernel_name.substr(1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
auto u_def = get_template_definition(
"v" + lib_name, "unary_v", get_type_string(out_type), op);
auto g_def = get_template_definition(
"g" + lib_name, "unary_g", get_type_string(out_type), op);
kernel_source << metal::utils() << metal::unary_ops() << metal::unary()
<< fmt::format(
unary_kernels,
lib_name,
get_type_string(out.dtype()),
op_name(out));
<< u_def << g_def;
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
void add_binary_kernels(
const std::string lib_name,
Dtype in_type,
Dtype out_type,
const std::string op,
std::ostringstream& kernel_source) {
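// Map each host_name prefix to the kernel template it instantiates;
// g4/g5 reuse binary_g_nd with an explicit dimension template parameter.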
const std::map<std::string, std::string> kernel_types = {
{"ss", "binary_ss"},
{"vs", "binary_vs"},
{"sv", "binary_sv"},
{"vv", "binary_vv"},
{"g1", "binary_g_nd1"},
{"g2", "binary_g_nd2"},
{"g3", "binary_g_nd3"},
{"g4", "binary_g_nd"},
{"g5", "binary_g_nd"},
{"gn", "binary_g"},
};
for (auto [name, func] : kernel_types) {
std::string template_def;
if (name == "g4" || name == "g5") {
int dim = std::stoi(name.substr(1));
template_def = get_template_definition(
name + lib_name,
func,
get_type_string(in_type),
get_type_string(out_type),
op,
dim);
} else {
template_def = get_template_definition(
name + lib_name,
func,
get_type_string(in_type),
get_type_string(out_type),
op);
}
kernel_source << template_def;
}
}
MTL::ComputePipelineState* get_binary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out) {
Dtype in_type,
Dtype out_type,
const std::string op) {
std::string lib_name = kernel_name.substr(2);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::binary_ops() << metal::binary()
<< fmt::format(
binary_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()),
op_name(out));
kernel_source << metal::utils() << metal::binary_ops() << metal::binary();
add_binary_kernels(lib_name, in_type, out_type, op, kernel_source);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
@@ -87,20 +120,16 @@ MTL::ComputePipelineState* get_binary_kernel(
MTL::ComputePipelineState* get_binary_two_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out) {
Dtype in_type,
Dtype out_type,
const std::string op) {
std::string lib_name = kernel_name.substr(2);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::binary_ops()
<< metal::binary_two()
<< fmt::format(
binary_two_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()),
op_name(out));
<< metal::binary_two();
add_binary_kernels(lib_name, in_type, out_type, op, kernel_source);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
@@ -109,17 +138,34 @@ MTL::ComputePipelineState* get_binary_two_kernel(
MTL::ComputePipelineState* get_ternary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out) {
Dtype type,
const std::string op) {
std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::ternary_ops() << metal::ternary()
<< fmt::format(
ternary_kernels,
lib_name,
get_type_string(out.dtype()),
op_name(out));
const std::map<std::string, std::string> kernel_types = {
{"v", "ternary_v"},
{"g", "ternary_g"},
{"g1", "ternary_g_nd1"},
{"g2", "ternary_g_nd2"},
{"g3", "ternary_g_nd3"},
{"g4", "ternary_g_nd"},
{"g5", "ternary_g_nd"},
};
kernel_source << metal::utils() << metal::ternary_ops() << metal::ternary();
for (auto [name, func] : kernel_types) {
std::string template_def;
if (name == "g4" || name == "g5") {
int dim = std::stoi(name.substr(1));
template_def = get_template_definition(
name + "_" + lib_name, func, get_type_string(type), op, dim);
} else {
template_def = get_template_definition(
name + "_" + lib_name, func, get_type_string(type), op);
}
kernel_source << template_def;
}
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
@@ -170,11 +216,14 @@ MTL::ComputePipelineState* get_scan_kernel(
const std::string& kernel_name,
bool reverse,
bool inclusive,
const std::string& reduce_type,
const array& in,
const array& out) {
std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::string op_name = "Cum" + reduce_type;
op_name[3] = toupper(op_name[3]);
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::scan()
<< fmt::format(
@@ -182,7 +231,7 @@ MTL::ComputePipelineState* get_scan_kernel(
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()),
op_name(out),
op_name,
inclusive,
reverse);
lib = d.get_library(lib_name, kernel_source.str());
@@ -201,14 +250,29 @@ MTL::ComputePipelineState* get_sort_kernel(
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::sort()
<< fmt::format(
block_sort_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()),
bn,
tn);
auto in_type = get_type_string(in.dtype());
auto out_type = get_type_string(out.dtype());
kernel_source << metal::utils() << metal::sort();
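// Instantiate both the argsort (carg_/ncarg_) and value sort (c_/nc_)
// variants, each with a contiguous and a non-contiguous kernel.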
for (bool is_argsort : {true, false}) {
std::string bool_string = is_argsort ? "true" : "false";
std::string func_string = is_argsort ? "carg_" : "c_";
kernel_source << get_template_definition(
func_string + lib_name,
"block_sort",
in_type,
out_type,
bool_string,
bn,
tn);
kernel_source << get_template_definition(
"n" + func_string + lib_name,
"block_sort_nc",
in_type,
out_type,
bool_string,
bn,
tn);
}
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
@@ -225,14 +289,21 @@ MTL::ComputePipelineState* get_mb_sort_kernel(
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::sort()
<< fmt::format(
multiblock_sort_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(idx.dtype()),
bn,
tn);
kernel_source << metal::utils() << metal::sort();
std::vector<std::pair<std::string, std::string>> kernel_types = {
{"sort_", "mb_block_sort"},
{"partition_", "mb_block_partition"},
{"merge_", "mb_block_merge"}};
for (auto [name, func] : kernel_types) {
kernel_source << get_template_definition(
name + lib_name,
func,
get_type_string(in.dtype()),
get_type_string(idx.dtype()),
"true",
bn,
tn);
}
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
@@ -486,4 +557,36 @@ MTL::ComputePipelineState* get_steel_conv_general_kernel(
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_fft_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& hash_name,
const metal::MTLFCList& func_consts,
const std::string& template_def) {
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
std::string kernel_string;
kernel_source << metal::fft() << template_def;
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}
MTL::ComputePipelineState* get_quantized_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& template_def) {
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::gemm() << metal::quantized()
<< template_def;
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
} // namespace mlx::core

@@ -1,5 +1,7 @@
// Copyright © 2024 Apple Inc.
#include <fmt/format.h>
#include "mlx/array.h"
#include "mlx/backend/metal/device.h"
@@ -13,24 +15,28 @@ MTL::ComputePipelineState* get_arange_kernel(
MTL::ComputePipelineState* get_unary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out);
Dtype out_type,
const std::string op);
MTL::ComputePipelineState* get_binary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out);
Dtype in_type,
Dtype out_type,
const std::string op);
MTL::ComputePipelineState* get_binary_two_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out);
Dtype in_type,
Dtype out_type,
const std::string op);
MTL::ComputePipelineState* get_ternary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out);
Dtype type,
const std::string op);
MTL::ComputePipelineState* get_copy_kernel(
metal::Device& d,
@@ -49,6 +55,7 @@ MTL::ComputePipelineState* get_scan_kernel(
const std::string& kernel_name,
bool reverse,
bool inclusive,
const std::string& reduce_type,
const array& in,
const array& out);
@@ -154,4 +161,38 @@ MTL::ComputePipelineState* get_steel_conv_general_kernel(
int wm,
int wn);
MTL::ComputePipelineState* get_fft_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& hash_name,
const metal::MTLFCList& func_consts,
const std::string& template_def);
MTL::ComputePipelineState* get_quantized_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& template_def);
// Create a GPU kernel template definition for JIT compilation
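// For example (illustrative names only):
//   get_template_definition("vabs_float32", "unary_v", "float", "Abs")
// returns:
//   template [[host_name("vabs_float32")]] [[kernel]] decltype(unary_v<float, Abs>) unary_v<float, Abs>;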
template <typename... Args>
std::string
get_template_definition(std::string name, std::string func, Args... args) {
std::ostringstream s;
s << func << "<";
bool first = true;
auto add_arg = [&s, &first](const auto& arg) {
if (!first) {
s << ", ";
}
first = false;
s << arg;
};
(add_arg(args), ...);
s << ">";
std::string base_string = R"(
template [[host_name("{0}")]] [[kernel]] decltype({1}) {1};
)";
return fmt::format(base_string, name, s.str());
}
} // namespace mlx::core

@@ -1,66 +1,14 @@
set(
HEADERS
BASE_HEADERS
bf16.h
bf16_math.h
complex.h
defines.h
utils.h
steel/conv/params.h
)
set(
KERNELS
"arg_reduce"
"conv"
"fft"
"gemv"
"quantized"
"random"
"rms_norm"
"layer_norm"
"rope"
"scaled_dot_product_attention"
)
if (NOT MLX_METAL_JIT)
set(
KERNELS
${KERNELS}
"arange"
"binary"
"binary_two"
"unary"
"ternary"
"copy"
"softmax"
"sort"
"scan"
"reduce"
)
set(
HEADERS
${HEADERS}
atomic.h
arange.h
unary_ops.h
unary.h
binary_ops.h
binary.h
ternary.h
copy.h
softmax.h
sort.h
scan.h
reduction/ops.h
reduction/reduce_init.h
reduction/reduce_all.h
reduction/reduce_col.h
reduction/reduce_row.h
)
endif()
function(build_kernel_base TARGET SRCFILE DEPS)
set(METAL_FLAGS -Wall -Wextra -fno-fast-math -D${MLX_METAL_VERSION})
set(METAL_FLAGS -Wall -Wextra -fno-fast-math)
if(MLX_METAL_DEBUG)
set(METAL_FLAGS ${METAL_FLAGS}
-gline-tables-only
@@ -72,7 +20,7 @@ function(build_kernel_base TARGET SRCFILE DEPS)
-c ${SRCFILE}
-I${PROJECT_SOURCE_DIR}
-o ${TARGET}.air
DEPENDS ${SRCFILE} ${DEPS}
DEPENDS ${SRCFILE} ${DEPS} ${BASE_HEADERS}
OUTPUT ${TARGET}.air
COMMENT "Building ${TARGET}.air"
VERBATIM
@@ -81,49 +29,100 @@ endfunction(build_kernel_base)
function(build_kernel KERNEL)
set(SRCFILE ${CMAKE_CURRENT_SOURCE_DIR}/${KERNEL}.metal)
build_kernel_base(${KERNEL} ${SRCFILE} "${HEADERS}")
cmake_path(GET KERNEL STEM TARGET)
build_kernel_base(${TARGET} ${SRCFILE} "${ARGN}")
set(KERNEL_AIR ${TARGET}.air ${KERNEL_AIR} PARENT_SCOPE)
endfunction(build_kernel)
foreach(KERNEL ${KERNELS})
build_kernel(${KERNEL})
set(KERNEL_AIR ${KERNEL}.air ${KERNEL_AIR})
endforeach()
build_kernel(arg_reduce)
build_kernel(conv steel/conv/params.h)
build_kernel(gemv steel/utils.h)
build_kernel(gemv_masked steel/utils.h)
build_kernel(layer_norm)
build_kernel(random)
build_kernel(rms_norm)
build_kernel(rope)
build_kernel(
scaled_dot_product_attention
scaled_dot_product_attention_params.h
steel/defines.h
steel/gemm/transforms.h
steel/utils.h
)
set(
STEEL_HEADERS
steel/defines.h
steel/utils.h
steel/conv/conv.h
steel/conv/loader.h
steel/conv/loaders/loader_channel_l.h
steel/conv/loaders/loader_channel_n.h
steel/conv/loaders/loader_general.h
steel/conv/kernels/steel_conv.h
steel/conv/kernels/steel_conv_general.h
steel/gemm/gemm.h
steel/gemm/mma.h
steel/gemm/loader.h
steel/gemm/transforms.h
steel/gemm/kernels/steel_gemm_fused.h
steel/gemm/kernels/steel_gemm_masked.h
steel/gemm/kernels/steel_gemm_splitk.h
)
if (NOT MLX_METAL_JIT)
set(
STEEL_KERNELS
${CMAKE_CURRENT_SOURCE_DIR}/steel/conv/kernels/steel_conv.metal
${CMAKE_CURRENT_SOURCE_DIR}/steel/conv/kernels/steel_conv_general.metal
${CMAKE_CURRENT_SOURCE_DIR}/steel/gemm/kernels/steel_gemm_fused.metal
${CMAKE_CURRENT_SOURCE_DIR}/steel/gemm/kernels/steel_gemm_masked.metal
${CMAKE_CURRENT_SOURCE_DIR}/steel/gemm/kernels/steel_gemm_splitk.metal
)
set(
STEEL_HEADERS
steel/defines.h
steel/utils.h
steel/conv/conv.h
steel/conv/loader.h
steel/conv/loaders/loader_channel_l.h
steel/conv/loaders/loader_channel_n.h
steel/conv/loaders/loader_general.h
steel/conv/kernels/steel_conv.h
steel/conv/kernels/steel_conv_general.h
steel/gemm/gemm.h
steel/gemm/mma.h
steel/gemm/loader.h
steel/gemm/transforms.h
steel/gemm/kernels/steel_gemm_fused.h
steel/gemm/kernels/steel_gemm_masked.h
steel/gemm/kernels/steel_gemm_splitk.h
)
foreach(KERNEL ${STEEL_KERNELS})
cmake_path(GET KERNEL STEM TARGET)
build_kernel_base(${TARGET} ${KERNEL} "${STEEL_HEADERS}")
set(KERNEL_AIR ${TARGET}.air ${KERNEL_AIR})
endforeach()
build_kernel(arange arange.h)
build_kernel(binary binary.h binary_ops.h)
build_kernel(binary_two binary_two.h)
build_kernel(copy copy.h)
build_kernel(
fft
fft.h
fft/radix.h
fft/readwrite.h
)
build_kernel(
reduce
atomic.h
reduction/ops.h
reduction/reduce_init.h
reduction/reduce_all.h
reduction/reduce_col.h
reduction/reduce_row.h
)
build_kernel(
quantized
quantized.h
${STEEL_HEADERS}
)
build_kernel(scan scan.h)
build_kernel(softmax softmax.h)
build_kernel(sort sort.h)
build_kernel(ternary ternary.h ternary_ops.h)
build_kernel(unary unary.h unary_ops.h)
build_kernel(
steel/conv/kernels/steel_conv
${STEEL_HEADERS}
)
build_kernel(
steel/conv/kernels/steel_conv_general
${STEEL_HEADERS}
)
build_kernel(
steel/gemm/kernels/steel_gemm_fused
${STEEL_HEADERS}
)
build_kernel(
steel/gemm/kernels/steel_gemm_masked
${STEEL_HEADERS}
)
build_kernel(
steel/gemm/kernels/steel_gemm_splitk
${STEEL_HEADERS}
)
endif()
add_custom_command(
OUTPUT ${MLX_METAL_PATH}/mlx.metallib
COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o ${MLX_METAL_PATH}/mlx.metallib

@@ -6,7 +6,7 @@
using namespace metal;
#if defined METAL_3_1 || (__METAL_VERSION__ >= 310)
#if (MLX_METAL_VERSION >= 310) || (__METAL_VERSION__ >= 310)
typedef bfloat bfloat16_t;

@@ -369,7 +369,7 @@ instantiate_metal_math_funcs(
return static_cast<otype>(__metal_simd_xor(static_cast<ctype>(data))); \
}
#if defined METAL_3_1 || (__METAL_VERSION__ >= 310)
#if (MLX_METAL_VERSION >= 310) || (__METAL_VERSION__ >= 310)
#define bfloat16_to_uint16(x) as_type<uint16_t>(x)
#define uint16_to_bfloat16(x) as_type<bfloat16_t>(x)

@@ -4,148 +4,91 @@
#include <metal_math>
// clang-format off
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/binary_ops.h"
#include "mlx/backend/metal/kernels/binary.h"
#define instantiate_binary(name, itype, otype, op, bopt) \
template \
[[host_name(name)]] [[kernel]] void binary_##bopt<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
uint index [[thread_position_in_grid]]);
#define instantiate_binary_all(op, tname, itype, otype) \
instantiate_kernel("ss" #op #tname, binary_ss, itype, otype, op) \
instantiate_kernel("sv" #op #tname, binary_sv, itype, otype, op) \
instantiate_kernel("vs" #op #tname, binary_vs, itype, otype, op) \
instantiate_kernel("vv" #op #tname, binary_vv, itype, otype, op) \
instantiate_kernel("gn" #op #tname, binary_g, itype, otype, op) \
instantiate_kernel("g1" #op #tname, binary_g_nd1, itype, otype, op) \
instantiate_kernel("g2" #op #tname, binary_g_nd2, itype, otype, op) \
instantiate_kernel("g3" #op #tname, binary_g_nd3, itype, otype, op) \
instantiate_kernel("g4" #op #tname, binary_g_nd, itype, otype, op, 4) \
instantiate_kernel("g5" #op #tname, binary_g_nd, itype, otype, op, 5)
#define instantiate_binary_g_dim(name, itype, otype, op, dims) \
template [[host_name("g" #dims name)]] [[kernel]] void \
binary_g_nd<itype, otype, op, dims>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
constant const int shape[dims], \
constant const size_t a_strides[dims], \
constant const size_t b_strides[dims], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_binary_integer(op) \
instantiate_binary_all(op, uint8, uint8_t, uint8_t) \
instantiate_binary_all(op, uint16, uint16_t, uint16_t) \
instantiate_binary_all(op, uint32, uint32_t, uint32_t) \
instantiate_binary_all(op, uint64, uint64_t, uint64_t) \
instantiate_binary_all(op, int8, int8_t, int8_t) \
instantiate_binary_all(op, int16, int16_t, int16_t) \
instantiate_binary_all(op, int32, int32_t, int32_t) \
instantiate_binary_all(op, int64, int64_t, int64_t)
#define instantiate_binary_g_nd(name, itype, otype, op) \
template [[host_name("g1" name)]] [[kernel]] void \
binary_g_nd1<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
constant const size_t& a_stride, \
constant const size_t& b_stride, \
uint index [[thread_position_in_grid]]); \
template [[host_name("g2" name)]] [[kernel]] void \
binary_g_nd2<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
constant const size_t a_strides[2], \
constant const size_t b_strides[2], \
uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]); \
template [[host_name("g3" name)]] [[kernel]] void \
binary_g_nd3<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
constant const size_t a_strides[3], \
constant const size_t b_strides[3], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
instantiate_binary_g_dim(name, itype, otype, op, 4) \
instantiate_binary_g_dim(name, itype, otype, op, 5)
#define instantiate_binary_float(op) \
instantiate_binary_all(op, float16, half, half) \
instantiate_binary_all(op, float32, float, float) \
instantiate_binary_all(op, bfloat16, bfloat16_t, bfloat16_t)
#define instantiate_binary_g(name, itype, otype, op) \
template [[host_name("gn" name)]] [[kernel]] void binary_g<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
constant const int* shape, \
constant const size_t* a_strides, \
constant const size_t* b_strides, \
constant const int& ndim, \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_binary_types(op) \
instantiate_binary_all(op, bool_, bool, bool) \
instantiate_binary_integer(op) \
instantiate_binary_all(op, complex64, complex64_t, complex64_t) \
instantiate_binary_float(op)
#define instantiate_binary_all(name, tname, itype, otype, op) \
instantiate_binary("ss" #name #tname, itype, otype, op, ss) \
instantiate_binary("sv" #name #tname, itype, otype, op, sv) \
instantiate_binary("vs" #name #tname, itype, otype, op, vs) \
instantiate_binary("vv" #name #tname, itype, otype, op, vv) \
instantiate_binary_g(#name #tname, itype, otype, op) \
instantiate_binary_g_nd(#name #tname, itype, otype, op)
#define instantiate_binary_types_bool(op) \
instantiate_binary_all(op, bool_, bool, bool) \
instantiate_binary_all(op, uint8, uint8_t, bool) \
instantiate_binary_all(op, uint16, uint16_t, bool) \
instantiate_binary_all(op, uint32, uint32_t, bool) \
instantiate_binary_all(op, uint64, uint64_t, bool) \
instantiate_binary_all(op, int8, int8_t, bool) \
instantiate_binary_all(op, int16, int16_t, bool) \
instantiate_binary_all(op, int32, int32_t, bool) \
instantiate_binary_all(op, int64, int64_t, bool) \
instantiate_binary_all(op, float16, half, bool) \
instantiate_binary_all(op, float32, float, bool) \
instantiate_binary_all(op, bfloat16, bfloat16_t, bool) \
instantiate_binary_all(op, complex64, complex64_t, bool)
#define instantiate_binary_integer(name, op) \
instantiate_binary_all(name, uint8, uint8_t, uint8_t, op) \
instantiate_binary_all(name, uint16, uint16_t, uint16_t, op) \
instantiate_binary_all(name, uint32, uint32_t, uint32_t, op) \
instantiate_binary_all(name, uint64, uint64_t, uint64_t, op) \
instantiate_binary_all(name, int8, int8_t, int8_t, op) \
instantiate_binary_all(name, int16, int16_t, int16_t, op) \
instantiate_binary_all(name, int32, int32_t, int32_t, op) \
instantiate_binary_all(name, int64, int64_t, int64_t, op)
#define instantiate_binary_float(name, op) \
instantiate_binary_all(name, float16, half, half, op) \
instantiate_binary_all(name, float32, float, float, op) \
instantiate_binary_all(name, bfloat16, bfloat16_t, bfloat16_t, op)
#define instantiate_binary_types(name, op) \
instantiate_binary_all(name, bool_, bool, bool, op) \
instantiate_binary_integer(name, op) \
instantiate_binary_all(name, complex64, complex64_t, complex64_t, op) \
instantiate_binary_float(name, op)
#define instantiate_binary_types_bool(name, op) \
instantiate_binary_all(name, bool_, bool, bool, op) \
instantiate_binary_all(name, uint8, uint8_t, bool, op) \
instantiate_binary_all(name, uint16, uint16_t, bool, op) \
instantiate_binary_all(name, uint32, uint32_t, bool, op) \
instantiate_binary_all(name, uint64, uint64_t, bool, op) \
instantiate_binary_all(name, int8, int8_t, bool, op) \
instantiate_binary_all(name, int16, int16_t, bool, op) \
instantiate_binary_all(name, int32, int32_t, bool, op) \
instantiate_binary_all(name, int64, int64_t, bool, op) \
instantiate_binary_all(name, float16, half, bool, op) \
instantiate_binary_all(name, float32, float, bool, op) \
instantiate_binary_all(name, bfloat16, bfloat16_t, bool, op) \
instantiate_binary_all(name, complex64, complex64_t, bool, op)
instantiate_binary_types(add, Add)
instantiate_binary_types(div, Divide)
instantiate_binary_types_bool(eq, Equal)
instantiate_binary_types_bool(ge, Greater)
instantiate_binary_types_bool(geq, GreaterEqual)
instantiate_binary_types_bool(le, Less)
instantiate_binary_types_bool(leq, LessEqual)
instantiate_binary_types_bool(neq, NotEqual)
instantiate_binary_float(lae, LogAddExp)
instantiate_binary_types(max, Maximum)
instantiate_binary_types(min, Minimum)
instantiate_binary_types(mul, Multiply)
instantiate_binary_types(sub, Subtract)
instantiate_binary_types(pow, Power)
instantiate_binary_types(rem, Remainder)
instantiate_binary_float(arctan2, ArcTan2)
instantiate_binary_types(Add)
instantiate_binary_types(Divide)
instantiate_binary_types_bool(Equal)
instantiate_binary_types_bool(Greater)
instantiate_binary_types_bool(GreaterEqual)
instantiate_binary_types_bool(Less)
instantiate_binary_types_bool(LessEqual)
instantiate_binary_types_bool(NotEqual)
instantiate_binary_float(LogAddExp)
instantiate_binary_types(Maximum)
instantiate_binary_types(Minimum)
instantiate_binary_types(Multiply)
instantiate_binary_types(Subtract)
instantiate_binary_types(Power)
instantiate_binary_types(Remainder)
instantiate_binary_float(ArcTan2)
// NaNEqual only needed for floating point types with boolean output
instantiate_binary_all(naneq, float16, half, bool, NaNEqual)
instantiate_binary_all(naneq, float32, float, bool, NaNEqual)
instantiate_binary_all(naneq, bfloat16, bfloat16_t, bool, NaNEqual)
instantiate_binary_all(naneq, complex64, complex64_t, bool, NaNEqual)
instantiate_binary_all(NaNEqual, float16, half, bool)
instantiate_binary_all(NaNEqual, float32, float, bool)
instantiate_binary_all(NaNEqual, bfloat16, bfloat16_t, bool)
instantiate_binary_all(NaNEqual, complex64, complex64_t, bool)
instantiate_binary_all(lor, bool_, bool, bool, LogicalOr)
instantiate_binary_all(land, bool_, bool, bool, LogicalAnd)
instantiate_binary_all(LogicalOr, bool_, bool, bool)
instantiate_binary_all(LogicalAnd, bool_, bool, bool)
// Bitwise ops only need integer types and bool (except for l/r shift)
instantiate_binary_integer(bitwise_and, BitwiseAnd)
instantiate_binary_all(bitwise_and, bool_, bool, bool, BitwiseAnd)
instantiate_binary_integer(bitwise_or, BitwiseOr)
instantiate_binary_all(bitwise_or, bool_, bool, bool, BitwiseOr)
instantiate_binary_integer(bitwise_xor, BitwiseXor)
instantiate_binary_all(bitwise_xor, bool_, bool, bool, BitwiseXor)
instantiate_binary_integer(left_shift, LeftShift)
instantiate_binary_integer(right_shift, RightShift) // clang-format on
instantiate_binary_integer(BitwiseAnd)
instantiate_binary_all(BitwiseAnd, bool_, bool, bool)
instantiate_binary_integer(BitwiseOr)
instantiate_binary_all(BitwiseOr, bool_, bool, bool)
instantiate_binary_integer(BitwiseXor)
instantiate_binary_all(BitwiseXor, bool_, bool, bool)
instantiate_binary_integer(LeftShift)
instantiate_binary_integer(RightShift) // clang-format on

@@ -7,99 +7,34 @@
#include "mlx/backend/metal/kernels/binary_ops.h"
#include "mlx/backend/metal/kernels/binary_two.h"
#define instantiate_binary(name, itype, otype, op, bopt) \
template [[host_name(name)]] [[kernel]] void \
binary_##bopt<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
uint index [[thread_position_in_grid]]);
#define instantiate_binary_all(op, tname, itype, otype) \
instantiate_kernel("ss" #op #tname, binary_ss, itype, otype, op) \
instantiate_kernel("sv" #op #tname, binary_sv, itype, otype, op) \
instantiate_kernel("vs" #op #tname, binary_vs, itype, otype, op) \
instantiate_kernel("vv" #op #tname, binary_vv, itype, otype, op) \
instantiate_kernel("gn" #op #tname, binary_g, itype, otype, op) \
instantiate_kernel("g1" #op #tname, binary_g_nd1, itype, otype, op) \
instantiate_kernel("g2" #op #tname, binary_g_nd2, itype, otype, op) \
instantiate_kernel("g3" #op #tname, binary_g_nd3, itype, otype, op) \
instantiate_kernel("g4" #op #tname, binary_g_nd, itype, otype, op, 4) \
instantiate_kernel("g5" #op #tname, binary_g_nd, itype, otype, op, 5)
#define instantiate_binary_g_dim(name, itype, otype, op, dims) \
template [[host_name("g" #dims name)]] [[kernel]] void \
binary_g_nd<itype, otype, op, dims>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const int shape[dims], \
constant const size_t a_strides[dims], \
constant const size_t b_strides[dims], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_binary_float(op) \
instantiate_binary_all(op, float16, half, half) \
instantiate_binary_all(op, float32, float, float) \
instantiate_binary_all(op, bfloat16, bfloat16_t, bfloat16_t)
#define instantiate_binary_g_nd(name, itype, otype, op) \
template [[host_name("g1" name)]] [[kernel]] void \
binary_g_nd1<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const size_t& a_stride, \
constant const size_t& b_stride, \
uint index [[thread_position_in_grid]]); \
template [[host_name("g2" name)]] [[kernel]] void \
binary_g_nd2<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const size_t a_strides[2], \
constant const size_t b_strides[2], \
uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]); \
template [[host_name("g3" name)]] [[kernel]] void \
binary_g_nd3<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const size_t a_strides[3], \
constant const size_t b_strides[3], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
instantiate_binary_g_dim(name, itype, otype, op, 4) \
instantiate_binary_g_dim(name, itype, otype, op, 5)
#define instantiate_binary_types(op) \
instantiate_binary_all(op, bool_, bool, bool) \
instantiate_binary_all(op, uint8, uint8_t, uint8_t) \
instantiate_binary_all(op, uint16, uint16_t, uint16_t) \
instantiate_binary_all(op, uint32, uint32_t, uint32_t) \
instantiate_binary_all(op, uint64, uint64_t, uint64_t) \
instantiate_binary_all(op, int8, int8_t, int8_t) \
instantiate_binary_all(op, int16, int16_t, int16_t) \
instantiate_binary_all(op, int32, int32_t, int32_t) \
instantiate_binary_all(op, int64, int64_t, int64_t) \
instantiate_binary_all(op, complex64, complex64_t, complex64_t) \
instantiate_binary_float(op)
#define instantiate_binary_g(name, itype, otype, op) \
template [[host_name("gn" name)]] [[kernel]] void \
binary_g<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const int* shape, \
constant const size_t* a_strides, \
constant const size_t* b_strides, \
constant const int& ndim, \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_binary_all(name, tname, itype, otype, op) \
instantiate_binary("ss" #name #tname, itype, otype, op, ss) \
instantiate_binary("sv" #name #tname, itype, otype, op, sv) \
instantiate_binary("vs" #name #tname, itype, otype, op, vs) \
instantiate_binary("vv" #name #tname, itype, otype, op, vv) \
instantiate_binary_g(#name #tname, itype, otype, op) \
instantiate_binary_g_nd(#name #tname, itype, otype, op)
#define instantiate_binary_float(name, op) \
instantiate_binary_all(name, float16, half, half, op) \
instantiate_binary_all(name, float32, float, float, op) \
instantiate_binary_all(name, bfloat16, bfloat16_t, bfloat16_t, op)
#define instantiate_binary_types(name, op) \
instantiate_binary_all(name, bool_, bool, bool, op) \
instantiate_binary_all(name, uint8, uint8_t, uint8_t, op) \
instantiate_binary_all(name, uint16, uint16_t, uint16_t, op) \
instantiate_binary_all(name, uint32, uint32_t, uint32_t, op) \
instantiate_binary_all(name, uint64, uint64_t, uint64_t, op) \
instantiate_binary_all(name, int8, int8_t, int8_t, op) \
instantiate_binary_all(name, int16, int16_t, int16_t, op) \
instantiate_binary_all(name, int32, int32_t, int32_t, op) \
instantiate_binary_all(name, int64, int64_t, int64_t, op) \
instantiate_binary_all(name, complex64, complex64_t, complex64_t, op) \
instantiate_binary_float(name, op)
instantiate_binary_types(divmod, DivMod) // clang-format on
instantiate_binary_types(DivMod) // clang-format on

@@ -13,3 +13,11 @@ static MTL_CONST constexpr int REDUCE_N_READS = 16;
static MTL_CONST constexpr int SOFTMAX_N_READS = 4;
static MTL_CONST constexpr int RMS_N_READS = 4;
static MTL_CONST constexpr int RMS_LOOPED_LIMIT = 4096;
// Instantiate a templated kernel.
// Extra args are used as template parameters:
// e.g. instantiate_kernel(binary_int, binary, a, b) ->
// [[host_name(binary_int)]] [[kernel]] binary<a, b>
#define instantiate_kernel(name, func, ...) \
template [[host_name( \
name)]] [[kernel]] decltype(func<__VA_ARGS__>) func<__VA_ARGS__>;

@@ -0,0 +1,486 @@
// Copyright © 2024 Apple Inc.
// Metal FFT using Stockham's algorithm
//
// References:
// - VkFFT (https://github.com/DTolm/VkFFT)
// - Eric Bainville's excellent page (http://www.bealto.com/gpu-fft.html)
#include <metal_common>
#include "mlx/backend/metal/kernels/fft/radix.h"
#include "mlx/backend/metal/kernels/fft/readwrite.h"
#include "mlx/backend/metal/kernels/steel/defines.h"
using namespace metal;
#define MAX_RADIX 13
// Reached when elems_per_thread_ = 6, max_radix = 13
// and some threads have to do 3 radix 6s requiring 18 float2s.
#define MAX_OUTPUT_SIZE 18
// Specialize for a particular value of N at runtime
STEEL_CONST bool inv_ [[function_constant(0)]];
STEEL_CONST bool is_power_of_2_ [[function_constant(1)]];
STEEL_CONST int elems_per_thread_ [[function_constant(2)]];
// rader_m = n / rader_n
STEEL_CONST int rader_m_ [[function_constant(3)]];
// Stockham steps
STEEL_CONST int radix_13_steps_ [[function_constant(4)]];
STEEL_CONST int radix_11_steps_ [[function_constant(5)]];
STEEL_CONST int radix_8_steps_ [[function_constant(6)]];
STEEL_CONST int radix_7_steps_ [[function_constant(7)]];
STEEL_CONST int radix_6_steps_ [[function_constant(8)]];
STEEL_CONST int radix_5_steps_ [[function_constant(9)]];
STEEL_CONST int radix_4_steps_ [[function_constant(10)]];
STEEL_CONST int radix_3_steps_ [[function_constant(11)]];
STEEL_CONST int radix_2_steps_ [[function_constant(12)]];
// Rader steps
STEEL_CONST int rader_13_steps_ [[function_constant(13)]];
STEEL_CONST int rader_11_steps_ [[function_constant(14)]];
STEEL_CONST int rader_8_steps_ [[function_constant(15)]];
STEEL_CONST int rader_7_steps_ [[function_constant(16)]];
STEEL_CONST int rader_6_steps_ [[function_constant(17)]];
STEEL_CONST int rader_5_steps_ [[function_constant(18)]];
STEEL_CONST int rader_4_steps_ [[function_constant(19)]];
STEEL_CONST int rader_3_steps_ [[function_constant(20)]];
STEEL_CONST int rader_2_steps_ [[function_constant(21)]];
// See "radix.h" for radix codelets
typedef void (*RadixFunc)(thread float2*, thread float2*);
// Perform a single radix n butterfly with appropriate twiddles
template <int radix, RadixFunc radix_func>
METAL_FUNC void radix_butterfly(
int i,
int p,
thread float2* x,
thread short* indices,
thread float2* y) {
// i: the index in the overall DFT that we're processing.
// p: the size of the DFTs we're merging at this step.
// m: how many threads are working on this DFT.
int k, j;
// Use faster bitwise operations when working with powers of two
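// When radix and p are powers of two, (i & (p - 1)) == i % p and
// ((i - k) << log2(radix)) == (i / p) * radix * p.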
constexpr bool radix_p_2 = (radix & (radix - 1)) == 0;
if (radix_p_2 && is_power_of_2_) {
constexpr short power = __builtin_ctz(radix);
k = i & (p - 1);
j = ((i - k) << power) + k;
} else {
k = i % p;
j = (i / p) * radix * p + k;
}
// Apply twiddles
if (p > 1) {
float2 twiddle_1 = get_twiddle(k, radix * p);
float2 twiddle = twiddle_1;
x[1] = complex_mul(x[1], twiddle);
STEEL_PRAGMA_UNROLL
for (int t = 2; t < radix; t++) {
twiddle = complex_mul(twiddle, twiddle_1);
x[t] = complex_mul(x[t], twiddle);
}
}
radix_func(x, y);
STEEL_PRAGMA_UNROLL
for (int t = 0; t < radix; t++) {
indices[t] = j + t * p;
}
}
// Perform all the radix steps required for a
// particular radix size n.
template <int radix, RadixFunc radix_func>
METAL_FUNC void radix_n_steps(
int i,
thread int* p,
int m,
int n,
int num_steps,
thread float2* inputs,
thread short* indices,
thread float2* values,
threadgroup float2* buf) {
int m_r = n / radix;
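// m_r: number of radix-sized butterflies needed to cover all n points.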
// When combining different sized radices, we have to do
// multiple butterflies in a single thread.
// E.g. n = 28 = 4 * 7
// 4 threads, 7 elems_per_thread
// All threads do 1 radix7 butterfly.
// 3 threads do 2 radix4 butterflies.
// 1 thread does 1 radix4 butterfly.
int max_radices_per_thread = (elems_per_thread_ + radix - 1) / radix;
int index = 0;
int r_index = 0;
for (int s = 0; s < num_steps; s++) {
for (int t = 0; t < max_radices_per_thread; t++) {
index = i + t * m;
if (index < m_r) {
for (int r = 0; r < radix; r++) {
inputs[r] = buf[index + r * m_r];
}
radix_butterfly<radix, radix_func>(
index, *p, inputs, indices + t * radix, values + t * radix);
}
}
// Wait until all threads have read their inputs into thread local mem
threadgroup_barrier(mem_flags::mem_threadgroup);
for (int t = 0; t < max_radices_per_thread; t++) {
index = i + t * m;
if (index < m_r) {
for (int r = 0; r < radix; r++) {
r_index = t * radix + r;
buf[indices[r_index]] = values[r_index];
}
}
}
// Wait until all threads have written back to threadgroup mem
threadgroup_barrier(mem_flags::mem_threadgroup);
*p *= radix;
}
}
#define RADIX_STEP(radix, radix_func, num_steps) \
radix_n_steps<radix, radix_func>( \
fft_idx, p, m, n, num_steps, inputs, indices, values, buf);
template <bool rader = false>
METAL_FUNC void
perform_fft(int fft_idx, thread int* p, int m, int n, threadgroup float2* buf) {
float2 inputs[MAX_RADIX];
short indices[MAX_OUTPUT_SIZE];
float2 values[MAX_OUTPUT_SIZE];
RADIX_STEP(2, radix2, rader ? rader_2_steps_ : radix_2_steps_);
RADIX_STEP(3, radix3, rader ? rader_3_steps_ : radix_3_steps_);
RADIX_STEP(4, radix4, rader ? rader_4_steps_ : radix_4_steps_);
RADIX_STEP(5, radix5, rader ? rader_5_steps_ : radix_5_steps_);
RADIX_STEP(6, radix6, rader ? rader_6_steps_ : radix_6_steps_);
RADIX_STEP(7, radix7, rader ? rader_7_steps_ : radix_7_steps_);
RADIX_STEP(8, radix8, rader ? rader_8_steps_ : radix_8_steps_);
RADIX_STEP(11, radix11, rader ? rader_11_steps_ : radix_11_steps_);
RADIX_STEP(13, radix13, rader ? rader_13_steps_ : radix_13_steps_);
}
// Each FFT is computed entirely in shared GPU memory.
//
// N is decomposed into radix-n DFTs:
// e.g. 128 = 2 * 4 * 4 * 4
template <int tg_mem_size, typename in_T, typename out_T>
[[kernel]] void fft(
const device in_T* in [[buffer(0)]],
device out_T* out [[buffer(1)]],
constant const int& n,
constant const int& batch_size,
uint3 elem [[thread_position_in_grid]],
uint3 grid [[threads_per_grid]]) {
threadgroup float2 shared_in[tg_mem_size];
thread ReadWriter<in_T, out_T> read_writer = ReadWriter<in_T, out_T>(
in,
&shared_in[0],
out,
n,
batch_size,
elems_per_thread_,
elem,
grid,
inv_);
if (read_writer.out_of_bounds()) {
return;
};
read_writer.load();
threadgroup_barrier(mem_flags::mem_threadgroup);
int p = 1;
int fft_idx = elem.z; // Thread index in DFT
int m = grid.z; // Threads per DFT
int tg_idx = elem.y * n; // Index of this DFT in threadgroup
threadgroup float2* buf = &shared_in[tg_idx];
perform_fft(fft_idx, &p, m, n, buf);
read_writer.write();
}
template <int tg_mem_size, typename in_T, typename out_T>
[[kernel]] void rader_fft(
const device in_T* in [[buffer(0)]],
device out_T* out [[buffer(1)]],
const device float2* raders_b_q [[buffer(2)]],
const device short* raders_g_q [[buffer(3)]],
const device short* raders_g_minus_q [[buffer(4)]],
constant const int& n,
constant const int& batch_size,
constant const int& rader_n,
uint3 elem [[thread_position_in_grid]],
uint3 grid [[threads_per_grid]]) {
// Use Rader's algorithm to compute fast FFTs
// when a prime factor `p` of `n` is greater than 13 but
// `p - 1` is Stockham decomposable into prime factors <= 13.
//
// E.g. n = 102
// = 2 * 3 * 17
// . = 2 * 3 * RADER(16)
// . = 2 * 3 * RADER(4 * 4)
//
// In numpy:
// x_perm = x[g_q]
// y = np.fft.fft(x_perm) * b_q
// z = np.fft.ifft(y) + x[0]
// out = z[g_minus_q]
// out[0] = x[1:].sum()
//
// Where the g_q and g_minus_q are permutations formed
// by the group under multiplicative modulo N using the
// primitive root of N and b_q is a constant.
// See https://en.wikipedia.org/wiki/Rader%27s_FFT_algorithm
//
// Rader's uses fewer operations than Bluestein's, so it accumulates less
// floating-point error and is more accurate. It's also faster in most cases.
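// One common host-side formulation of these tables (a hedged sketch; the
// exact CPU code may differ) for a prime p with primitive root g:
//   g_q = [pow(g, q, p) for q in range(p - 1)]
//   g_minus_q = [pow(g, p - 1 - q, p) for q in range(p - 1)]
//   b_q = np.fft.fft(np.exp(-2j * np.pi * np.array(g_minus_q) / p))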
threadgroup float2 shared_in[tg_mem_size];
thread ReadWriter<in_T, out_T> read_writer = ReadWriter<in_T, out_T>(
in,
&shared_in[0],
out,
n,
batch_size,
elems_per_thread_,
elem,
grid,
inv_);
if (read_writer.out_of_bounds()) {
return;
};
read_writer.load();
threadgroup_barrier(mem_flags::mem_threadgroup);
// The number of the threads we're using for each DFT
int m = grid.z;
int fft_idx = elem.z;
int tg_idx = elem.y * n;
threadgroup float2* buf = &shared_in[tg_idx];
// rader_m = n / rader_n;
int rader_m = rader_m_;
// We have to load two x_0s for each thread since sometimes
// elems_per_thread_ crosses a boundary.
// E.g. with n = 34, rader_n = 17, elems_per_thread_ = 4
// 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 7 7 7 7 8 8
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
short x_0_index =
metal::min(fft_idx * elems_per_thread_ / (rader_n - 1), rader_m - 1);
float2 x_0[2] = {buf[x_0_index], buf[x_0_index + 1]};
// Do the Rader permutation in shared memory
float2 temp[MAX_RADIX];
int max_index = n - rader_m - 1;
for (int e = 0; e < elems_per_thread_; e++) {
short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
short g_q = raders_g_q[index / rader_m];
temp[e] = buf[rader_m + (g_q - 1) * rader_m + index % rader_m];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
for (int e = 0; e < elems_per_thread_; e++) {
short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
buf[index + rader_m] = temp[e];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Rader FFT on x[rader_m:]
int p = 1;
perform_fft</*rader=*/true>(fft_idx, &p, m, n - rader_m, buf + rader_m);
// x_1 + ... + x_n is computed for us in the first FFT step so
// we save it in the first rader_m indices of the array for later.
int x_sum_index = metal::min(fft_idx, rader_m - 1);
buf[x_sum_index] = buf[rader_m + x_sum_index * (rader_n - 1)];
float2 inv = {1.0f, -1.0f};
for (int e = 0; e < elems_per_thread_; e++) {
short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
short interleaved_index =
index / rader_m + (index % rader_m) * (rader_n - 1);
temp[e] = complex_mul(
buf[rader_m + interleaved_index],
raders_b_q[interleaved_index % (rader_n - 1)]);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
for (int e = 0; e < elems_per_thread_; e++) {
short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
buf[rader_m + index] = temp[e] * inv;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Rader IFFT on x[rader_m:]
p = 1;
perform_fft</*rader=*/true>(fft_idx, &p, m, n - rader_m, buf + rader_m);
float2 rader_inv_factor = {1.0f / (rader_n - 1), -1.0f / (rader_n - 1)};
for (int e = 0; e < elems_per_thread_; e++) {
short index = metal::min(fft_idx * elems_per_thread_ + e, n - rader_m - 1);
short diff_index = index / (rader_n - 1) - x_0_index;
temp[e] = buf[rader_m + index] * rader_inv_factor + x_0[diff_index];
}
// Use the sum of elements that was computed in the first FFT
float2 x_sum = buf[x_0_index] + x_0[0];
threadgroup_barrier(mem_flags::mem_threadgroup);
for (int e = 0; e < elems_per_thread_; e++) {
short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
short g_q_index = index % (rader_n - 1);
short g_q = raders_g_minus_q[g_q_index];
short out_index = index - g_q_index + g_q + (index / (rader_n - 1));
buf[out_index] = temp[e];
}
buf[x_0_index * rader_n] = x_sum;
threadgroup_barrier(mem_flags::mem_threadgroup);
p = rader_n;
perform_fft(fft_idx, &p, m, n, buf);
read_writer.write();
}
template <int tg_mem_size, typename in_T, typename out_T>
[[kernel]] void bluestein_fft(
const device in_T* in [[buffer(0)]],
device out_T* out [[buffer(1)]],
const device float2* w_q [[buffer(2)]],
const device float2* w_k [[buffer(3)]],
constant const int& length,
constant const int& n,
constant const int& batch_size,
uint3 elem [[thread_position_in_grid]],
uint3 grid [[threads_per_grid]]) {
// Computes arbitrary length FFTs with Bluestein's algorithm
//
// In numpy:
// bluestein_n = next_power_of_2(2*n - 1)
// out = w_k * np.fft.ifft(np.fft.fft(w_k * in, bluestein_n) * w_q)
//
// Where w_k and w_q are precomputed on CPU in high precision as:
// w_k = np.exp(-1j * np.pi / n * (np.arange(-n + 1, n) ** 2))
// w_q = np.fft.fft(1/w_k[-n:])
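// Note on naming in this kernel: `length` is the requested FFT size and `n`
// is the padded power-of-two size (bluestein_n above). Only the first
// `length` entries of w_k are indexed, and the convolution result is read
// back at an offset of length - 1 (see write_padded in the ReadWriter).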
threadgroup float2 shared_in[tg_mem_size];
thread ReadWriter<in_T, out_T> read_writer = ReadWriter<in_T, out_T>(
in,
&shared_in[0],
out,
n,
batch_size,
elems_per_thread_,
elem,
grid,
inv_);
if (read_writer.out_of_bounds()) {
return;
};
read_writer.load_padded(length, w_k);
threadgroup_barrier(mem_flags::mem_threadgroup);
int p = 1;
int fft_idx = elem.z; // Thread index in DFT
int m = grid.z; // Threads per DFT
int tg_idx = elem.y * n; // Index of this DFT in threadgroup
threadgroup float2* buf = &shared_in[tg_idx];
// fft
perform_fft(fft_idx, &p, m, n, buf);
float2 inv = float2(1.0f, -1.0f);
for (int t = 0; t < elems_per_thread_; t++) {
int index = fft_idx + t * m;
buf[index] = complex_mul(buf[index], w_q[index]) * inv;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// ifft
p = 1;
perform_fft(fft_idx, &p, m, n, buf);
read_writer.write_padded(length, w_k);
}
template <
int tg_mem_size,
typename in_T,
typename out_T,
int step,
bool real = false>
[[kernel]] void four_step_fft(
const device in_T* in [[buffer(0)]],
device out_T* out [[buffer(1)]],
constant const int& n1,
constant const int& n2,
constant const int& batch_size,
uint3 elem [[thread_position_in_grid]],
uint3 grid [[threads_per_grid]]) {
// Fast four step FFT implementation for powers of 2.
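// Roughly (a sketch of the classic four-step decomposition, not the exact
// kernel indexing): view the length n1 * n2 input as an (n1, n2) matrix,
// FFT along one axis (step == 0), scale element (j, k) by the twiddle
// exp(-2j * pi * j * k / (n1 * n2)) -- applied in write_strided -- then
// FFT along the other axis (step == 1) and read the result out transposed.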
int overall_n = n1 * n2;
int n = step == 0 ? n1 : n2;
int stride = step == 0 ? n2 : n1;
// The number of the threads we're using for each DFT
int m = grid.z;
int fft_idx = elem.z;
threadgroup float2 shared_in[tg_mem_size];
threadgroup float2* buf = &shared_in[elem.y * n];
using read_writer_t = ReadWriter<in_T, out_T, step, real>;
read_writer_t read_writer = read_writer_t(
in,
&shared_in[0],
out,
n,
batch_size,
elems_per_thread_,
elem,
grid,
inv_);
if (read_writer.out_of_bounds()) {
return;
};
read_writer.load_strided(stride, overall_n);
threadgroup_barrier(mem_flags::mem_threadgroup);
int p = 1;
perform_fft(fft_idx, &p, m, n, buf);
read_writer.write_strided(stride, overall_n);
}

View File

@@ -1,199 +1,67 @@
// Copyright © 2024 Apple Inc.
// Metal FFT using Stockham's algorithm
//
// References:
// - VkFFT (https://github.com/DTolm/VkFFT)
// - Eric Bainville's excellent page (http://www.bealto.com/gpu-fft.html)
#include <metal_common>
#include <metal_math>
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/fft.h"
using namespace metal;
#define instantiate_fft(tg_mem_size, in_T, out_T) \
instantiate_kernel( \
"fft_mem_" #tg_mem_size "_" #in_T "_" #out_T, \
fft, \
tg_mem_size, \
in_T, \
out_T)
float2 complex_mul(float2 a, float2 b) {
float2 c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
#define instantiate_rader(tg_mem_size, in_T, out_T) \
instantiate_kernel( \
"rader_fft_mem_" #tg_mem_size "_" #in_T "_" #out_T, \
rader_fft, \
tg_mem_size, \
in_T, \
out_T)
float2 get_twiddle(int k, int p) {
float theta = -1.0f * k * M_PI_F / (2 * p);
#define instantiate_bluestein(tg_mem_size, in_T, out_T) \
instantiate_kernel( \
"bluestein_fft_mem_" #tg_mem_size "_" #in_T "_" #out_T, \
bluestein_fft, \
tg_mem_size, \
in_T, \
out_T)
float2 twiddle;
twiddle.x = metal::fast::cos(theta);
twiddle.y = metal::fast::sin(theta);
return twiddle;
}
#define instantiate_four_step(tg_mem_size, in_T, out_T, step, real) \
instantiate_kernel( \
"four_step_mem_" #tg_mem_size "_" #in_T "_" #out_T "_" #step "_" #real, \
four_step_fft, \
tg_mem_size, \
in_T, \
out_T, \
step, \
real)
// single threaded radix2 implementation
void radix2(
int i,
int p,
int m,
threadgroup float2* read_buf,
threadgroup float2* write_buf) {
float2 x_0 = read_buf[i];
float2 x_1 = read_buf[i + m];
// The index within this sub-DFT
int k = i & (p - 1);
float2 twiddle = get_twiddle(k, p);
float2 z = complex_mul(x_1, twiddle);
float2 y_0 = x_0 + z;
float2 y_1 = x_0 - z;
int j = (i << 1) - k;
write_buf[j] = y_0;
write_buf[j + p] = y_1;
}
// single threaded radix4 implementation
void radix4(
int i,
int p,
int m,
threadgroup float2* read_buf,
threadgroup float2* write_buf) {
float2 x_0 = read_buf[i];
float2 x_1 = read_buf[i + m];
float2 x_2 = read_buf[i + 2 * m];
float2 x_3 = read_buf[i + 3 * m];
// The index within this sub-DFT
int k = i & (p - 1);
float2 twiddle = get_twiddle(k, p);
// e^a * e^b = e^(a + b)
float2 twiddle_2 = complex_mul(twiddle, twiddle);
float2 twiddle_3 = complex_mul(twiddle, twiddle_2);
x_1 = complex_mul(x_1, twiddle);
x_2 = complex_mul(x_2, twiddle_2);
x_3 = complex_mul(x_3, twiddle_3);
float2 minus_i;
minus_i.x = 0;
minus_i.y = -1;
// Hard coded twiddle factors for DFT4
float2 z_0 = x_0 + x_2;
float2 z_1 = x_0 - x_2;
float2 z_2 = x_1 + x_3;
float2 z_3 = complex_mul(x_1 - x_3, minus_i);
float2 y_0 = z_0 + z_2;
float2 y_1 = z_1 + z_3;
float2 y_2 = z_0 - z_2;
float2 y_3 = z_1 - z_3;
int j = ((i - k) << 2) + k;
write_buf[j] = y_0;
write_buf[j + p] = y_1;
write_buf[j + 2 * p] = y_2;
write_buf[j + 3 * p] = y_3;
}
// Each FFT is computed entirely in shared GPU memory.
//
// N is decomposed into radix-2 and radix-4 DFTs:
// e.g. 128 = 2 * 4 * 4 * 4
//
// At each step we use n / 4 threads, each performing
// a single-threaded radix-4 or radix-2 DFT.
//
// We provide the number of radix-2 and radix-4
// steps at compile time for a ~20% performance boost.
template <size_t n, size_t radix_2_steps, size_t radix_4_steps>
[[kernel]] void fft(
const device float2* in [[buffer(0)]],
device float2* out [[buffer(1)]],
uint3 thread_position_in_grid [[thread_position_in_grid]],
uint3 threads_per_grid [[threads_per_grid]]) {
// Index of the DFT in batch
int batch_idx = thread_position_in_grid.x * n;
// The index in the DFT we're working on
int i = thread_position_in_grid.y;
// The number of the threads we're using for each DFT
int m = threads_per_grid.y;
// Allocate 2 shared memory buffers for Stockham.
// We alternate reading from one and writing to the other at each radix step.
threadgroup float2 shared_in[n];
threadgroup float2 shared_out[n];
// Pointers to facilitate Stockham buffer swapping
threadgroup float2* read_buf = shared_in;
threadgroup float2* write_buf = shared_out;
threadgroup float2* tmp;
// Copy input into shared memory
shared_in[i] = in[batch_idx + i];
shared_in[i + m] = in[batch_idx + i + m];
shared_in[i + 2 * m] = in[batch_idx + i + 2 * m];
shared_in[i + 3 * m] = in[batch_idx + i + 3 * m];
threadgroup_barrier(mem_flags::mem_threadgroup);
int p = 1;
for (size_t r = 0; r < radix_2_steps; r++) {
radix2(i, p, m * 2, read_buf, write_buf);
radix2(i + m, p, m * 2, read_buf, write_buf);
p *= 2;
threadgroup_barrier(mem_flags::mem_threadgroup);
// Stockham switch of buffers
tmp = write_buf;
write_buf = read_buf;
read_buf = tmp;
}
for (size_t r = 0; r < radix_4_steps; r++) {
radix4(i, p, m, read_buf, write_buf);
p *= 4;
threadgroup_barrier(mem_flags::mem_threadgroup);
// Stockham switch of buffers
tmp = write_buf;
write_buf = read_buf;
read_buf = tmp;
}
// Copy shared memory to output
out[batch_idx + i] = read_buf[i];
out[batch_idx + i + m] = read_buf[i + m];
out[batch_idx + i + 2 * m] = read_buf[i + 2 * m];
out[batch_idx + i + 3 * m] = read_buf[i + 3 * m];
}
#define instantiate_fft(name, n, radix_2_steps, radix_4_steps) \
template [[host_name("fft_" #name)]] [[kernel]] void \
fft<n, radix_2_steps, radix_4_steps>( \
const device float2* in [[buffer(0)]], \
device float2* out [[buffer(1)]], \
uint3 thread_position_in_grid [[thread_position_in_grid]], \
uint3 threads_per_grid [[threads_per_grid]]);
// Explicitly define kernels for each power of 2.
// clang-format off
instantiate_fft(4, /* n= */ 4, /* radix_2_steps= */ 0, /* radix_4_steps= */ 1)
instantiate_fft(8, 8, 1, 1) instantiate_fft(16, 16, 0, 2)
instantiate_fft(32, 32, 1, 2) instantiate_fft(64, 64, 0, 3)
instantiate_fft(128, 128, 1, 3) instantiate_fft(256, 256, 0, 4)
instantiate_fft(512, 512, 1, 4)
instantiate_fft(1024, 1024, 0, 5)
// 2048 is the max that will fit into 32KB of threadgroup memory.
// TODO: implement 4 step FFT for larger n.
instantiate_fft(2048, 2048, 1, 5) // clang-format on
#define instantiate_ffts(tg_mem_size) \
instantiate_fft(tg_mem_size, float2, float2) \
instantiate_fft(tg_mem_size, float, float2) \
instantiate_fft(tg_mem_size, float2, float) \
instantiate_rader(tg_mem_size, float2, float2) \
instantiate_rader(tg_mem_size, float, float2) \
instantiate_rader(tg_mem_size, float2, float) \
instantiate_bluestein(tg_mem_size, float2, float2) \
instantiate_bluestein(tg_mem_size, float, float2) \
instantiate_bluestein(tg_mem_size, float2, float) \
instantiate_four_step(tg_mem_size, float2, float2, 0, /*real=*/false) \
instantiate_four_step(tg_mem_size, float2, float2, 1, /*real=*/false) \
instantiate_four_step(tg_mem_size, float, float2, 0, /*real=*/true) \
instantiate_four_step(tg_mem_size, float2, float2, 1, /*real=*/true) \
instantiate_four_step(tg_mem_size, float2, float2, 0, /*real=*/true) \
instantiate_four_step(tg_mem_size, float2, float, 1, /*real=*/true)
// It's substantially faster to statically define the
// threadgroup memory size rather than using
// `setThreadgroupMemoryLength` on the compute encoder.
// For non-power of 2 sizes we round up the shared memory.
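// e.g. a single n = 300 FFT per threadgroup (a hypothetical size) would
// dispatch the tg_mem_size = 512 instantiation below.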
instantiate_ffts(256)
instantiate_ffts(512)
instantiate_ffts(1024)
instantiate_ffts(2048)
// 4096 is the max that will fit into 32KB of threadgroup memory.
instantiate_ffts(4096) // clang-format on

View File

@@ -0,0 +1,328 @@
// Copyright © 2024 Apple Inc.
/* Radix kernels
We provide optimized, single threaded Radix codelets
for n=2,3,4,5,6,7,8,10,11,12,13.
For n=2,3,4,5,6 we hand write the codelets.
For n=8,10,12 we combine smaller codelets.
For n=7,11,13 we use Rader's algorithm which decomposes
them into (n-1)=6,10,12 codelets. */
#pragma once
#include <metal_common>
#include <metal_math>
#include <metal_stdlib>
METAL_FUNC float2 complex_mul(float2 a, float2 b) {
return float2(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
}
// Complex mul followed by conjugate
METAL_FUNC float2 complex_mul_conj(float2 a, float2 b) {
return float2(a.x * b.x - a.y * b.y, -a.x * b.y - a.y * b.x);
}
// Compute an FFT twiddle factor
METAL_FUNC float2 get_twiddle(int k, int p) {
float theta = -2.0f * k * M_PI_F / p;
float2 twiddle = {metal::fast::cos(theta), metal::fast::sin(theta)};
return twiddle;
}
METAL_FUNC void radix2(thread float2* x, thread float2* y) {
y[0] = x[0] + x[1];
y[1] = x[0] - x[1];
}
METAL_FUNC void radix3(thread float2* x, thread float2* y) {
float pi_2_3 = -0.8660254037844387;
float2 a_1 = x[1] + x[2];
float2 a_2 = x[1] - x[2];
y[0] = x[0] + a_1;
float2 b_1 = x[0] - 0.5 * a_1;
float2 b_2 = pi_2_3 * a_2;
float2 b_2_j = {-b_2.y, b_2.x};
y[1] = b_1 + b_2_j;
y[2] = b_1 - b_2_j;
}
METAL_FUNC void radix4(thread float2* x, thread float2* y) {
float2 z_0 = x[0] + x[2];
float2 z_1 = x[0] - x[2];
float2 z_2 = x[1] + x[3];
float2 z_3 = x[1] - x[3];
float2 z_3_i = {z_3.y, -z_3.x};
y[0] = z_0 + z_2;
y[1] = z_1 + z_3_i;
y[2] = z_0 - z_2;
y[3] = z_1 - z_3_i;
}
METAL_FUNC void radix5(thread float2* x, thread float2* y) {
float2 root_5_4 = 0.5590169943749475;
float2 sin_2pi_5 = 0.9510565162951535;
float2 sin_1pi_5 = 0.5877852522924731;
float2 a_1 = x[1] + x[4];
float2 a_2 = x[2] + x[3];
float2 a_3 = x[1] - x[4];
float2 a_4 = x[2] - x[3];
float2 a_5 = a_1 + a_2;
float2 a_6 = root_5_4 * (a_1 - a_2);
float2 a_7 = x[0] - a_5 / 4;
float2 a_8 = a_7 + a_6;
float2 a_9 = a_7 - a_6;
float2 a_10 = sin_2pi_5 * a_3 + sin_1pi_5 * a_4;
float2 a_11 = sin_1pi_5 * a_3 - sin_2pi_5 * a_4;
float2 a_10_j = {a_10.y, -a_10.x};
float2 a_11_j = {a_11.y, -a_11.x};
y[0] = x[0] + a_5;
y[1] = a_8 + a_10_j;
y[2] = a_9 + a_11_j;
y[3] = a_9 - a_11_j;
y[4] = a_8 - a_10_j;
}
METAL_FUNC void radix6(thread float2* x, thread float2* y) {
float sin_pi_3 = 0.8660254037844387;
float2 a_1 = x[2] + x[4];
float2 a_2 = x[0] - a_1 / 2;
float2 a_3 = sin_pi_3 * (x[2] - x[4]);
float2 a_4 = x[5] + x[1];
float2 a_5 = x[3] - a_4 / 2;
float2 a_6 = sin_pi_3 * (x[5] - x[1]);
float2 a_7 = x[0] + a_1;
float2 a_3_i = {a_3.y, -a_3.x};
float2 a_6_i = {a_6.y, -a_6.x};
float2 a_8 = a_2 + a_3_i;
float2 a_9 = a_2 - a_3_i;
float2 a_10 = x[3] + a_4;
float2 a_11 = a_5 + a_6_i;
float2 a_12 = a_5 - a_6_i;
y[0] = a_7 + a_10;
y[1] = a_8 - a_11;
y[2] = a_9 + a_12;
y[3] = a_7 - a_10;
y[4] = a_8 + a_11;
y[5] = a_9 - a_12;
}
METAL_FUNC void radix7(thread float2* x, thread float2* y) {
// Rader's algorithm
float2 inv = {1 / 6.0, -1 / 6.0};
// fft
float2 in1[6] = {x[1], x[3], x[2], x[6], x[4], x[5]};
radix6(in1, y + 1);
y[0] = y[1] + x[0];
// b_q
y[1] = complex_mul_conj(y[1], float2(-1, 0));
y[2] = complex_mul_conj(y[2], float2(2.44013336, -1.02261879));
y[3] = complex_mul_conj(y[3], float2(2.37046941, -1.17510629));
y[4] = complex_mul_conj(y[4], float2(0, -2.64575131));
y[5] = complex_mul_conj(y[5], float2(2.37046941, 1.17510629));
y[6] = complex_mul_conj(y[6], float2(-2.44013336, -1.02261879));
// ifft
radix6(y + 1, x + 1);
y[1] = x[1] * inv + x[0];
y[5] = x[2] * inv + x[0];
y[4] = x[3] * inv + x[0];
y[6] = x[4] * inv + x[0];
y[2] = x[5] * inv + x[0];
y[3] = x[6] * inv + x[0];
}
METAL_FUNC void radix8(thread float2* x, thread float2* y) {
float cos_pi_4 = 0.7071067811865476;
float2 w_0 = {cos_pi_4, -cos_pi_4};
float2 w_1 = {-cos_pi_4, -cos_pi_4};
float2 temp[8] = {x[0], x[2], x[4], x[6], x[1], x[3], x[5], x[7]};
radix4(temp, x);
radix4(temp + 4, x + 4);
y[0] = x[0] + x[4];
y[4] = x[0] - x[4];
float2 x_5 = complex_mul(x[5], w_0);
y[1] = x[1] + x_5;
y[5] = x[1] - x_5;
float2 x_6 = {x[6].y, -x[6].x};
y[2] = x[2] + x_6;
y[6] = x[2] - x_6;
float2 x_7 = complex_mul(x[7], w_1);
y[3] = x[3] + x_7;
y[7] = x[3] - x_7;
}
template <bool raders_perm>
METAL_FUNC void radix10(thread float2* x, thread float2* y) {
float2 w[4];
w[0] = {0.8090169943749475, -0.5877852522924731};
w[1] = {0.30901699437494745, -0.9510565162951535};
w[2] = {-w[1].x, w[1].y};
w[3] = {-w[0].x, w[0].y};
if (raders_perm) {
float2 temp[10] = {
x[0], x[3], x[4], x[8], x[2], x[1], x[7], x[9], x[6], x[5]};
radix5(temp, x);
radix5(temp + 5, x + 5);
} else {
float2 temp[10] = {
x[0], x[2], x[4], x[6], x[8], x[1], x[3], x[5], x[7], x[9]};
radix5(temp, x);
radix5(temp + 5, x + 5);
}
y[0] = x[0] + x[5];
y[5] = x[0] - x[5];
for (int t = 1; t < 5; t++) {
float2 a = complex_mul(x[t + 5], w[t - 1]);
y[t] = x[t] + a;
y[t + 5] = x[t] - a;
}
}
METAL_FUNC void radix11(thread float2* x, thread float2* y) {
// Rader's algorithm
float2 inv = {1 / 10.0, -1 / 10.0};
// fft
radix10<true>(x + 1, y + 1);
y[0] = y[1] + x[0];
// b_q
y[1] = complex_mul_conj(y[1], float2(-1, 0));
y[2] = complex_mul_conj(y[2], float2(0.955301878, -3.17606649));
y[3] = complex_mul_conj(y[3], float2(2.63610556, 2.01269656));
y[4] = complex_mul_conj(y[4], float2(2.54127802, 2.13117479));
y[5] = complex_mul_conj(y[5], float2(2.07016210, 2.59122150));
y[6] = complex_mul_conj(y[6], float2(0, -3.31662479));
y[7] = complex_mul_conj(y[7], float2(2.07016210, -2.59122150));
y[8] = complex_mul_conj(y[8], float2(-2.54127802, 2.13117479));
y[9] = complex_mul_conj(y[9], float2(2.63610556, -2.01269656));
y[10] = complex_mul_conj(y[10], float2(-0.955301878, -3.17606649));
// ifft
radix10<false>(y + 1, x + 1);
y[1] = x[1] * inv + x[0];
y[6] = x[2] * inv + x[0];
y[3] = x[3] * inv + x[0];
y[7] = x[4] * inv + x[0];
y[9] = x[5] * inv + x[0];
y[10] = x[6] * inv + x[0];
y[5] = x[7] * inv + x[0];
y[8] = x[8] * inv + x[0];
y[4] = x[9] * inv + x[0];
y[2] = x[10] * inv + x[0];
}
template <bool raders_perm>
METAL_FUNC void radix12(thread float2* x, thread float2* y) {
float2 w[6];
float sin_pi_3 = 0.8660254037844387;
w[0] = {sin_pi_3, -0.5};
w[1] = {0.5, -sin_pi_3};
w[2] = {0, -1};
w[3] = {-0.5, -sin_pi_3};
w[4] = {-sin_pi_3, -0.5};
if (raders_perm) {
float2 temp[12] = {
x[0],
x[3],
x[2],
x[11],
x[8],
x[9],
x[1],
x[7],
x[5],
x[10],
x[4],
x[6]};
radix6(temp, x);
radix6(temp + 6, x + 6);
} else {
float2 temp[12] = {
x[0],
x[2],
x[4],
x[6],
x[8],
x[10],
x[1],
x[3],
x[5],
x[7],
x[9],
x[11]};
radix6(temp, x);
radix6(temp + 6, x + 6);
}
y[0] = x[0] + x[6];
y[6] = x[0] - x[6];
for (int t = 1; t < 6; t++) {
float2 a = complex_mul(x[t + 6], w[t - 1]);
y[t] = x[t] + a;
y[t + 6] = x[t] - a;
}
}
METAL_FUNC void radix13(thread float2* x, thread float2* y) {
// Rader's algorithm
float2 inv = {1 / 12.0, -1 / 12.0};
// fft
radix12<true>(x + 1, y + 1);
y[0] = y[1] + x[0];
// b_q
y[1] = complex_mul_conj(y[1], float2(-1, 0));
y[2] = complex_mul_conj(y[2], float2(3.07497206, -1.88269669));
y[3] = complex_mul_conj(y[3], float2(3.09912468, 1.84266823));
y[4] = complex_mul_conj(y[4], float2(3.45084438, -1.04483161));
y[5] = complex_mul_conj(y[5], float2(0.91083583, 3.48860690));
y[6] = complex_mul_conj(y[6], float2(-3.60286363, 0.139189267));
y[7] = complex_mul_conj(y[7], float2(3.60555128, 0));
y[8] = complex_mul_conj(y[8], float2(3.60286363, 0.139189267));
y[9] = complex_mul_conj(y[9], float2(0.91083583, -3.48860690));
y[10] = complex_mul_conj(y[10], float2(-3.45084438, -1.04483161));
y[11] = complex_mul_conj(y[11], float2(3.09912468, -1.84266823));
y[12] = complex_mul_conj(y[12], float2(-3.07497206, -1.88269669));
// ifft
radix12<false>(y + 1, x + 1);
y[1] = x[1] * inv + x[0];
y[7] = x[2] * inv + x[0];
y[10] = x[3] * inv + x[0];
y[5] = x[4] * inv + x[0];
y[9] = x[5] * inv + x[0];
y[11] = x[6] * inv + x[0];
y[12] = x[7] * inv + x[0];
y[6] = x[8] * inv + x[0];
y[3] = x[9] * inv + x[0];
y[8] = x[10] * inv + x[0];
y[4] = x[11] * inv + x[0];
y[2] = x[12] * inv + x[0];
}

View File

@@ -0,0 +1,622 @@
// Copyright © 2024 Apple Inc.
#include <metal_common>
#include "mlx/backend/metal/kernels/fft/radix.h"
/* FFT helpers for reading and writing from/to device memory.
For many sizes, GPU FFTs are memory bandwidth bound so
read/write performance is important.
Where possible, we read 128 bits sequentially in each thread,
coalesced with accesses from adjacent threads for optimal performance.
We implement specialized reading/writing for:
- FFT
- RFFT
- IRFFT
Each with support for:
- Contiguous reads
- Padded reads
- Strided reads
*/
#define MAX_RADIX 13
using namespace metal;
template <
typename in_T,
typename out_T,
int step = 0,
bool four_step_real = false>
struct ReadWriter {
const device in_T* in;
threadgroup float2* buf;
device out_T* out;
int n;
int batch_size;
int elems_per_thread;
uint3 elem;
uint3 grid;
int threads_per_tg;
bool inv;
// Used for strided access
int strided_device_idx = 0;
int strided_shared_idx = 0;
METAL_FUNC ReadWriter(
const device in_T* in_,
threadgroup float2* buf_,
device out_T* out_,
const short n_,
const int batch_size_,
const short elems_per_thread_,
const uint3 elem_,
const uint3 grid_,
const bool inv_)
: in(in_),
buf(buf_),
out(out_),
n(n_),
batch_size(batch_size_),
elems_per_thread(elems_per_thread_),
elem(elem_),
grid(grid_),
inv(inv_) {
// Account for padding on last threadgroup
threads_per_tg = elem.x == grid.x - 1
? (batch_size - (grid.x - 1) * grid.y) * grid.z
: grid.y * grid.z;
}
// ifft(x) = 1/n * conj(fft(conj(x)))
METAL_FUNC float2 post_in(float2 elem) const {
return inv ? float2(elem.x, -elem.y) : elem;
}
// Handle float case for generic RFFT alg
METAL_FUNC float2 post_in(float elem) const {
return float2(elem, 0);
}
METAL_FUNC float2 pre_out(float2 elem) const {
return inv ? float2(elem.x / n, -elem.y / n) : elem;
}
METAL_FUNC float2 pre_out(float2 elem, int length) const {
return inv ? float2(elem.x / length, -elem.y / length) : elem;
}
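// The identity behind post_in/pre_out, in numpy terms (a sanity check,
// not kernel code):
//   np.allclose(np.fft.ifft(x), np.conj(np.fft.fft(np.conj(x))) / len(x))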
METAL_FUNC bool out_of_bounds() const {
// Account for possible extra threadgroups
int grid_index = elem.x * grid.y + elem.y;
return grid_index >= batch_size;
}
METAL_FUNC void load() const {
int batch_idx = elem.x * grid.y * n;
short tg_idx = elem.y * grid.z + elem.z;
short max_index = grid.y * n - 2;
// 2 complex64s = 128 bits
constexpr int read_width = 2;
for (short e = 0; e < (elems_per_thread / read_width); e++) {
short index = read_width * tg_idx + read_width * threads_per_tg * e;
index = metal::min(index, max_index);
// vectorized reads
buf[index] = post_in(in[batch_idx + index]);
buf[index + 1] = post_in(in[batch_idx + index + 1]);
}
max_index += 1;
if (elems_per_thread % 2 != 0) {
short index = tg_idx +
read_width * threads_per_tg * (elems_per_thread / read_width);
index = metal::min(index, max_index);
buf[index] = post_in(in[batch_idx + index]);
}
}
METAL_FUNC void write() const {
int batch_idx = elem.x * grid.y * n;
short tg_idx = elem.y * grid.z + elem.z;
short max_index = grid.y * n - 2;
constexpr int read_width = 2;
for (short e = 0; e < (elems_per_thread / read_width); e++) {
short index = read_width * tg_idx + read_width * threads_per_tg * e;
index = metal::min(index, max_index);
// vectorized writes
out[batch_idx + index] = pre_out(buf[index]);
out[batch_idx + index + 1] = pre_out(buf[index + 1]);
}
max_index += 1;
if (elems_per_thread % 2 != 0) {
short index = tg_idx +
read_width * threads_per_tg * (elems_per_thread / read_width);
index = metal::min(index, max_index);
out[batch_idx + index] = pre_out(buf[index]);
}
}
// Padded IO for Bluestein's algorithm
METAL_FUNC void load_padded(int length, const device float2* w_k) const {
int batch_idx = elem.x * grid.y * length + elem.y * length;
int fft_idx = elem.z;
int m = grid.z;
threadgroup float2* seq_buf = buf + elem.y * n;
for (int e = 0; e < elems_per_thread; e++) {
int index = metal::min(fft_idx + e * m, n - 1);
if (index < length) {
float2 elem = post_in(in[batch_idx + index]);
seq_buf[index] = complex_mul(elem, w_k[index]);
} else {
seq_buf[index] = 0.0;
}
}
}
METAL_FUNC void write_padded(int length, const device float2* w_k) const {
int batch_idx = elem.x * grid.y * length + elem.y * length;
int fft_idx = elem.z;
int m = grid.z;
float2 inv_factor = {1.0f / n, -1.0f / n};
threadgroup float2* seq_buf = buf + elem.y * n;
for (int e = 0; e < elems_per_thread; e++) {
int index = metal::min(fft_idx + e * m, n - 1);
if (index < length) {
float2 elem = seq_buf[index + length - 1] * inv_factor;
out[batch_idx + index] = pre_out(complex_mul(elem, w_k[index]), length);
}
}
}
// Strided IO for four step FFT
METAL_FUNC void compute_strided_indices(int stride, int overall_n) {
// Use the batch threadgroup dimension to coalesce memory accesses:
// e.g. stride = 12
// device | shared mem
// 0 1 2 3 | 0 12 - -
// - - - - | 1 13 - -
// - - - - | 2 14 - -
// 12 13 14 15 | 3 15 - -
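// i.e. threads that are adjacent within a coalesce_width group read adjacent
// device elements (coalesced), but each writes into a different length-n
// sub-buffer of shared memory.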
int coalesce_width = grid.y;
int tg_idx = elem.y * grid.z + elem.z;
int outer_batch_size = stride / coalesce_width;
int strided_batch_idx = (elem.x % outer_batch_size) * coalesce_width +
overall_n * (elem.x / outer_batch_size);
strided_device_idx = strided_batch_idx +
tg_idx / coalesce_width * elems_per_thread * stride +
tg_idx % coalesce_width;
strided_shared_idx = (tg_idx % coalesce_width) * n +
tg_idx / coalesce_width * elems_per_thread;
}
// Four Step FFT First Step
METAL_FUNC void load_strided(int stride, int overall_n) {
compute_strided_indices(stride, overall_n);
for (int e = 0; e < elems_per_thread; e++) {
buf[strided_shared_idx + e] =
post_in(in[strided_device_idx + e * stride]);
}
}
METAL_FUNC void write_strided(int stride, int overall_n) {
for (int e = 0; e < elems_per_thread; e++) {
float2 output = buf[strided_shared_idx + e];
int combined_idx = (strided_device_idx + e * stride) % overall_n;
int ij = (combined_idx / stride) * (combined_idx % stride);
// Apply four step twiddles at end of first step
float2 twiddle = get_twiddle(ij, overall_n);
out[strided_device_idx + e * stride] = complex_mul(output, twiddle);
}
}
};
// Four Step FFT Second Step
template <>
METAL_FUNC void ReadWriter<float2, float2, /*step=*/1>::load_strided(
int stride,
int overall_n) {
// Silence compiler warnings
(void)stride;
(void)overall_n;
// Don't invert between steps
bool default_inv = inv;
inv = false;
load();
inv = default_inv;
}
template <>
METAL_FUNC void ReadWriter<float2, float2, /*step=*/1>::write_strided(
int stride,
int overall_n) {
compute_strided_indices(stride, overall_n);
for (int e = 0; e < elems_per_thread; e++) {
float2 output = buf[strided_shared_idx + e];
out[strided_device_idx + e * stride] = pre_out(output, overall_n);
}
}
// For RFFT, we interleave batches of two real sequences into one complex one:
//
// z_k = x_k + j.y_k
// X_k = (Z_k + Z_(N-k)*) / 2
// Y_k = -j * ((Z_k - Z_(N-k)*) / 2)
//
// This roughly doubles the throughput over the regular FFT.
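// A rough numpy sketch of that unpacking (x and y here are the two packed
// real sequences, not kernel variables):
//   Z = np.fft.fft(x + 1j * y)
//   Z_rev = np.conj(np.roll(Z[::-1], 1))   # Z_(N-k)*
//   X = (Z + Z_rev) / 2                    # == np.fft.fft(x)
//   Y = -1j * (Z - Z_rev) / 2              # == np.fft.fft(y)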
template <>
METAL_FUNC bool ReadWriter<float, float2>::out_of_bounds() const {
int grid_index = elem.x * grid.y + elem.y;
// We pack two sequences into one for RFFTs
return grid_index * 2 >= batch_size;
}
template <>
METAL_FUNC void ReadWriter<float, float2>::load() const {
int batch_idx = elem.x * grid.y * n * 2 + elem.y * n * 2;
threadgroup float2* seq_buf = buf + elem.y * n;
// No out of bounds accesses on odd batch sizes
int grid_index = elem.x * grid.y + elem.y;
short next_in =
batch_size % 2 == 1 && grid_index * 2 == batch_size - 1 ? 0 : n;
short m = grid.z;
short fft_idx = elem.z;
for (int e = 0; e < elems_per_thread; e++) {
int index = metal::min(fft_idx + e * m, n - 1);
seq_buf[index].x = in[batch_idx + index];
seq_buf[index].y = in[batch_idx + index + next_in];
}
}
template <>
METAL_FUNC void ReadWriter<float, float2>::write() const {
short n_over_2 = (n / 2) + 1;
int batch_idx = elem.x * grid.y * n_over_2 * 2 + elem.y * n_over_2 * 2;
threadgroup float2* seq_buf = buf + elem.y * n;
int grid_index = elem.x * grid.y + elem.y;
short next_out =
batch_size % 2 == 1 && grid_index * 2 == batch_size - 1 ? 0 : n_over_2;
float2 conj = {1, -1};
float2 minus_j = {0, -1};
short m = grid.z;
short fft_idx = elem.z;
for (int e = 0; e < elems_per_thread / 2 + 1; e++) {
int index = metal::min(fft_idx + e * m, n_over_2 - 1);
// x_0 = z_0.real
// y_0 = z_0.imag
if (index == 0) {
out[batch_idx + index] = {seq_buf[index].x, 0};
out[batch_idx + index + next_out] = {seq_buf[index].y, 0};
} else {
float2 x_k = seq_buf[index];
float2 x_n_minus_k = seq_buf[n - index] * conj;
out[batch_idx + index] = (x_k + x_n_minus_k) / 2;
out[batch_idx + index + next_out] =
complex_mul(((x_k - x_n_minus_k) / 2), minus_j);
}
}
}
template <>
METAL_FUNC void ReadWriter<float, float2>::load_padded(
int length,
const device float2* w_k) const {
int batch_idx = elem.x * grid.y * length * 2 + elem.y * length * 2;
threadgroup float2* seq_buf = buf + elem.y * n;
// No out of bounds accesses on odd batch sizes
int grid_index = elem.x * grid.y + elem.y;
short next_in =
batch_size % 2 == 1 && grid_index * 2 == batch_size - 1 ? 0 : length;
short m = grid.z;
short fft_idx = elem.z;
for (int e = 0; e < elems_per_thread; e++) {
int index = metal::min(fft_idx + e * m, n - 1);
if (index < length) {
float2 elem =
float2(in[batch_idx + index], in[batch_idx + index + next_in]);
seq_buf[index] = complex_mul(elem, w_k[index]);
} else {
seq_buf[index] = 0;
}
}
}
template <>
METAL_FUNC void ReadWriter<float, float2>::write_padded(
int length,
const device float2* w_k) const {
int length_over_2 = (length / 2) + 1;
int batch_idx =
elem.x * grid.y * length_over_2 * 2 + elem.y * length_over_2 * 2;
threadgroup float2* seq_buf = buf + elem.y * n + length - 1;
int grid_index = elem.x * grid.y + elem.y;
short next_out = batch_size % 2 == 1 && grid_index * 2 == batch_size - 1
? 0
: length_over_2;
float2 conj = {1, -1};
float2 inv_factor = {1.0f / n, -1.0f / n};
float2 minus_j = {0, -1};
short m = grid.z;
short fft_idx = elem.z;
for (int e = 0; e < elems_per_thread / 2 + 1; e++) {
int index = metal::min(fft_idx + e * m, length_over_2 - 1);
// x_0 = z_0.real
// y_0 = z_0.imag
if (index == 0) {
float2 elem = complex_mul(w_k[index], seq_buf[index] * inv_factor);
out[batch_idx + index] = float2(elem.x, 0);
out[batch_idx + index + next_out] = float2(elem.y, 0);
} else {
float2 x_k = complex_mul(w_k[index], seq_buf[index] * inv_factor);
float2 x_n_minus_k = complex_mul(
w_k[length - index], seq_buf[length - index] * inv_factor);
x_n_minus_k *= conj;
// The w_k multiplication must happen before this extraction
out[batch_idx + index] = (x_k + x_n_minus_k) / 2;
out[batch_idx + index + next_out] =
complex_mul(((x_k - x_n_minus_k) / 2), minus_j);
}
}
}
// For IRFFT, we do the opposite
//
// Z_k = X_k + j.Y_k
// x_k = Re(Z_k)
// y_k = Im(Z_k)
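// A rough numpy sketch of the packing (n even; X = np.fft.rfft(x) and
// Y = np.fft.rfft(y) are illustrative names, not kernel variables):
//   Z = np.concatenate([X + 1j * Y,
//                       np.conj(X[1:-1][::-1]) + 1j * np.conj(Y[1:-1][::-1])])
//   z = np.fft.ifft(Z)
//   x, y = z.real, z.imag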
template <>
METAL_FUNC bool ReadWriter<float2, float>::out_of_bounds() const {
int grid_index = elem.x * grid.y + elem.y;
// We pack two sequences into one for IRFFTs
return grid_index * 2 >= batch_size;
}
template <>
METAL_FUNC void ReadWriter<float2, float>::load() const {
short n_over_2 = (n / 2) + 1;
int batch_idx = elem.x * grid.y * n_over_2 * 2 + elem.y * n_over_2 * 2;
threadgroup float2* seq_buf = buf + elem.y * n;
// No out of bounds accesses on odd batch sizes
int grid_index = elem.x * grid.y + elem.y;
short next_in =
batch_size % 2 == 1 && grid_index * 2 == batch_size - 1 ? 0 : n_over_2;
short m = grid.z;
short fft_idx = elem.z;
float2 conj = {1, -1};
float2 plus_j = {0, 1};
for (int t = 0; t < elems_per_thread / 2 + 1; t++) {
int index = metal::min(fft_idx + t * m, n_over_2 - 1);
float2 x = in[batch_idx + index];
float2 y = in[batch_idx + index + next_in];
// NumPy forces first input to be real
bool first_val = index == 0;
// NumPy forces last input on even irffts to be real
bool last_val = n % 2 == 0 && index == n_over_2 - 1;
if (first_val || last_val) {
x = float2(x.x, 0);
y = float2(y.x, 0);
}
seq_buf[index] = x + complex_mul(y, plus_j);
seq_buf[index].y = -seq_buf[index].y;
if (index > 0 && !last_val) {
seq_buf[n - index] = (x * conj) + complex_mul(y * conj, plus_j);
seq_buf[n - index].y = -seq_buf[n - index].y;
}
}
}
template <>
METAL_FUNC void ReadWriter<float2, float>::write() const {
int batch_idx = elem.x * grid.y * n * 2 + elem.y * n * 2;
threadgroup float2* seq_buf = buf + elem.y * n;
int grid_index = elem.x * grid.y + elem.y;
short next_out =
batch_size % 2 == 1 && grid_index * 2 == batch_size - 1 ? 0 : n;
short m = grid.z;
short fft_idx = elem.z;
for (int e = 0; e < elems_per_thread; e++) {
int index = metal::min(fft_idx + e * m, n - 1);
out[batch_idx + index] = seq_buf[index].x / n;
out[batch_idx + index + next_out] = seq_buf[index].y / -n;
}
}
template <>
METAL_FUNC void ReadWriter<float2, float>::load_padded(
int length,
const device float2* w_k) const {
int n_over_2 = (n / 2) + 1;
int length_over_2 = (length / 2) + 1;
int batch_idx =
elem.x * grid.y * length_over_2 * 2 + elem.y * length_over_2 * 2;
threadgroup float2* seq_buf = buf + elem.y * n;
// No out of bounds accesses on odd batch sizes
int grid_index = elem.x * grid.y + elem.y;
short next_in = batch_size % 2 == 1 && grid_index * 2 == batch_size - 1
? 0
: length_over_2;
short m = grid.z;
short fft_idx = elem.z;
float2 conj = {1, -1};
float2 plus_j = {0, 1};
for (int t = 0; t < elems_per_thread / 2 + 1; t++) {
int index = metal::min(fft_idx + t * m, n_over_2 - 1);
float2 x = in[batch_idx + index];
float2 y = in[batch_idx + index + next_in];
if (index < length_over_2) {
bool last_val = length % 2 == 0 && index == length_over_2 - 1;
if (last_val) {
x = float2(x.x, 0);
y = float2(y.x, 0);
}
float2 elem1 = x + complex_mul(y, plus_j);
seq_buf[index] = complex_mul(elem1 * conj, w_k[index]);
if (index > 0 && !last_val) {
float2 elem2 = (x * conj) + complex_mul(y * conj, plus_j);
seq_buf[length - index] =
complex_mul(elem2 * conj, w_k[length - index]);
}
} else {
short pad_index = metal::min(length + (index - length_over_2) * 2, n - 2);
seq_buf[pad_index] = 0;
seq_buf[pad_index + 1] = 0;
}
}
}
template <>
METAL_FUNC void ReadWriter<float2, float>::write_padded(
int length,
const device float2* w_k) const {
int batch_idx = elem.x * grid.y * length * 2 + elem.y * length * 2;
threadgroup float2* seq_buf = buf + elem.y * n + length - 1;
int grid_index = elem.x * grid.y + elem.y;
short next_out =
batch_size % 2 == 1 && grid_index * 2 == batch_size - 1 ? 0 : length;
short m = grid.z;
short fft_idx = elem.z;
float2 inv_factor = {1.0f / n, -1.0f / n};
for (int e = 0; e < elems_per_thread; e++) {
int index = fft_idx + e * m;
if (index < length) {
float2 output = complex_mul(seq_buf[index] * inv_factor, w_k[index]);
out[batch_idx + index] = output.x / length;
out[batch_idx + index + next_out] = output.y / -length;
}
}
}
// Four Step RFFT
template <>
METAL_FUNC void
ReadWriter<float2, float2, /*step=*/1, /*real=*/true>::load_strided(
int stride,
int overall_n) {
// Silence compiler warnings
(void)stride;
(void)overall_n;
// Don't invert between steps
bool default_inv = inv;
inv = false;
load();
inv = default_inv;
}
template <>
METAL_FUNC void
ReadWriter<float2, float2, /*step=*/1, /*real=*/true>::write_strided(
int stride,
int overall_n) {
int overall_n_over_2 = overall_n / 2 + 1;
int coalesce_width = grid.y;
int tg_idx = elem.y * grid.z + elem.z;
int outer_batch_size = stride / coalesce_width;
int strided_batch_idx = (elem.x % outer_batch_size) * coalesce_width +
overall_n_over_2 * (elem.x / outer_batch_size);
strided_device_idx = strided_batch_idx +
tg_idx / coalesce_width * elems_per_thread / 2 * stride +
tg_idx % coalesce_width;
strided_shared_idx = (tg_idx % coalesce_width) * n +
tg_idx / coalesce_width * elems_per_thread / 2;
for (int e = 0; e < elems_per_thread / 2; e++) {
float2 output = buf[strided_shared_idx + e];
out[strided_device_idx + e * stride] = output;
}
// Add on n/2 + 1 element
if (tg_idx == 0 && elem.x % outer_batch_size == 0) {
out[strided_batch_idx + overall_n / 2] = buf[n / 2];
}
}
// Four Step IRFFT
template <>
METAL_FUNC void
ReadWriter<float2, float2, /*step=*/0, /*real=*/true>::load_strided(
int stride,
int overall_n) {
int overall_n_over_2 = overall_n / 2 + 1;
auto conj = float2(1, -1);
compute_strided_indices(stride, overall_n);
// Translate indices in terms of N - k
for (int e = 0; e < elems_per_thread; e++) {
int device_idx = strided_device_idx + e * stride;
int overall_batch = device_idx / overall_n;
int overall_index = device_idx % overall_n;
if (overall_index < overall_n_over_2) {
device_idx -= overall_batch * (overall_n - overall_n_over_2);
buf[strided_shared_idx + e] = in[device_idx] * conj;
} else {
int conj_idx = overall_n - overall_index;
device_idx = overall_batch * overall_n_over_2 + conj_idx;
buf[strided_shared_idx + e] = in[device_idx];
}
}
}
template <>
METAL_FUNC void
ReadWriter<float2, float, /*step=*/1, /*real=*/true>::load_strided(
int stride,
int overall_n) {
// Silence compiler warnings
(void)stride;
(void)overall_n;
bool default_inv = inv;
inv = false;
load();
inv = default_inv;
}
template <>
METAL_FUNC void
ReadWriter<float2, float, /*step=*/1, /*real=*/true>::write_strided(
int stride,
int overall_n) {
compute_strided_indices(stride, overall_n);
for (int e = 0; e < elems_per_thread; e++) {
out[strided_device_idx + e * stride] =
pre_out(buf[strided_shared_idx + e], overall_n).x;
}
}

View File

@@ -17,29 +17,250 @@ using namespace metal;
#define MLX_MTL_CONST static constant constexpr const
MLX_MTL_CONST int SIMD_SIZE = 32;
template <
typename T,
const int BM, /* Threadgroup rows (in threads) */
const int BN, /* Threadgroup cols (in threads) */
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN, /* Thread cols (in elements) */
const bool kDoAxpby> /* Do out = alpha * out + beta * bias */
struct GEMVKernel {
static_assert(BN == SIMD_SIZE, "gemv block must have a width of SIMD_SIZE");
MLX_MTL_CONST int threadsM = BM * SM;
MLX_MTL_CONST int threadsN = BN * SN;
// - The matrix of size (M = out_vec_size, N = in_vec_size) is divided up
// into blocks of (BM * TM, BN * TN) divided among threadgroups
MLX_MTL_CONST int blockM = threadsM * TM;
MLX_MTL_CONST int blockN = threadsN * TN;
static_assert(SM * SN == 32, "simdgroup can only have 32 threads");
static_assert(
SN == 8 || SN == 16 || SN == 32,
"gemv block must have a width of 8, 16, or 32");
// - The matrix of size (M = out_vec_size, K = in_vec_size) is divided up
// into blocks of (blockM, blockN) divided among threadgroups
// - Every thread works on a block of (TM, TN)
// - We assume each threadgroup is launched with (BN, BM, 1) threads
// - We assume each threadgroup has (threadsN, threadsM, 1) threads
//
// 1. A thread loads TN elements each from mat along TM contiguous rows
// 1. A thread loads TN elements each from mat along TM rows
// and the corresponding scalar from the vector
// 2. The thread then multiplies and adds to accumulate its local result for
// the block
// 3. At the end, each thread has accumulated results over all blocks across
// the rows. These are then summed up across the threadgroup
// 4. Each threadgroup writes its accumulated blockM outputs
//
// Edge case handling:
// - The threadgroup with the largest tid has blocks that exceed the matrix
// * The blocks that start outside the matrix are never read (thread results
// remain zero)
// * The last thread that partially overlaps with the matrix is shifted
// inwards such that the thread block fits exactly in the matrix
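// As a concrete (illustrative) example: BM = 4, BN = 1, SM = 1, SN = 32,
// TM = 4, TN = 4 gives threadsM = 4 and threadsN = 32, so each threadgroup
// covers a blockM x blockN = 16 x 128 tile of the matrix per iteration.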
MLX_MTL_CONST short tgp_mem_size = BN > 1 ? BN*(blockM + TM) : 0;
MLX_MTL_CONST bool needs_tgp_reduction = BN > 1;
static METAL_FUNC void
load_unsafe(const device T* src, thread T dst[TN], const int src_offset = 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
dst[tn] = src[src_offset + tn];
}
}
static METAL_FUNC void load_safe(
const device T* src,
thread T dst[TN],
const int src_offset = 0,
const int src_size = TN) {
if (src_offset + TN <= src_size) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
dst[tn] = src[src_offset + tn];
}
} else { // Edgecase
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
dst[tn] = src_offset + tn < src_size ? src[src_offset + tn] : 0;
}
}
}
static METAL_FUNC void run(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
const device T* bias [[buffer(2)]],
device T* out_vec [[buffer(3)]],
const constant int& in_vec_size [[buffer(4)]],
const constant int& out_vec_size [[buffer(5)]],
const constant int& matrix_ld [[buffer(6)]],
const constant float& alpha [[buffer(7)]],
const constant float& beta [[buffer(8)]],
const constant int& bias_stride [[buffer(14)]],
threadgroup T* tgp_memory [[threadgroup(0)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
// Appease compiler
(void)lid;
// Thread local accumulation results
thread T result[TM] = {0};
thread T inter[TN];
thread T v_coeff[TN];
const int thrM = SN != 32 ? simd_lid / SN : 0;
const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);
const int sgN = BN != 1 ? (simd_gid % BN) : 0;
const int simdM = BN != 1 ? SM * (simd_gid / BN) : int(SM * simd_gid);
const int simdN = BN != 1 ? SN * (simd_gid % BN) : 0;
int bm = (simdM + thrM) * TM;
int bn = (simdN + thrN) * TN;
// Block position
int out_row = tid.x * blockM + bm;
// Exit simdgroup if rows out of bound
if (out_row >= out_vec_size)
return;
// Adjust tail simdgroup to ensure in bound reads
out_row = out_row + TM <= out_vec_size ? out_row : out_vec_size - TM;
// Advance matrix
mat += out_row * matrix_ld;
constexpr const uniform<int> loop_stride = make_uniform(blockN);
const uniform<int> in_size = make_uniform(in_vec_size);
const uniform<int> n_iter = in_size / loop_stride;
const uniform<int> last_iter = loop_stride * n_iter;
const uniform<int> leftover = in_size - last_iter;
// Loop over in_vec in blocks of blockN
for (int i = 0; i < n_iter; ++i) {
load_unsafe(in_vec, v_coeff, bn);
// Per thread work loop
int mat_offset = 0;
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
// Load for the row
load_unsafe(mat, inter, mat_offset + bn);
// Accumulate results
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
result[tm] += inter[tn] * v_coeff[tn];
}
mat_offset += matrix_ld;
}
bn += blockN;
}
if (leftover > 0) {
load_safe(in_vec, v_coeff, bn, in_size);
// Per thread work loop
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
// Load for the row
load_safe(&mat[tm * matrix_ld], inter, bn, in_size);
// Accumulate results
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
result[tm] += inter[tn] * v_coeff[tn];
}
}
}
// Simdgroup accumulations
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
MLX_MTL_PRAGMA_UNROLL
for (ushort sn = (SN / 2); sn >= 1; sn >>= 1) {
result[tm] += simd_shuffle_down(result[tm], sn);
}
}
// Threadgroup accumulation results
if (needs_tgp_reduction) {
threadgroup T* tgp_results = tgp_memory + sgN * (blockM + TM) + bm;
if (thrN == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
tgp_results[tm] = result[tm];
}
threadgroup_barrier(mem_flags::mem_none);
if (sgN == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int sgn = 1; sgn < BN; sgn++) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
result[tm] += tgp_results[sgn * (blockM + TM) + tm];
}
}
}
}
}
// Write outputs
if (simdN == 0 && thrN == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
if (kDoAxpby) {
out_vec[out_row + tm] = static_cast<T>(alpha) * result[tm] +
static_cast<T>(beta) * bias[(out_row + tm) * bias_stride];
} else {
out_vec[out_row + tm] = result[tm];
}
}
}
}
};
///////////////////////////////////////////////////////////////////////////////
/// Vector matrix multiplication
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN, /* Thread cols (in elements) */
const bool kDoAxpby> /* Do out = alpha * out + beta * bias */
struct GEMVTKernel {
MLX_MTL_CONST int threadsM = BM * SM;
MLX_MTL_CONST int threadsN = BN * SN;
MLX_MTL_CONST int blockM = threadsM * TM;
MLX_MTL_CONST int blockN = threadsN * TN;
static_assert(SM * SN == 32, "simdgroup can only have 32 threads");
// - The matrix of size (M = in_vec_size, N = out_vec_size) is divided up
// into blocks of (blockM, blockN) divided among threadgroups
// - Every thread works on a block of (TM, TN)
// - We assume each threadgroup has (threadsN, threadsM, 1) threads
//
// 1. A thread loads TN elements each from mat along TM contiguous rows
// and the corresponding scalar from the vector
// 2. The thread then accumulates its local result for the block
// 3. At the end, each thread has accumulated results over all blocks across
// the rows. These are then summed up across the threadgroup
// 4. Each threadgroup writes its accumulated BN * TN outputs
//
// Edge case handling:
@@ -49,7 +270,8 @@ struct GEMVKernel {
// * The last thread that partially overlaps with the matrix is shifted
// inwards such that the thread block fits exactly in the matrix
MLX_MTL_CONST short tgp_mem_size = BN * TN * 2;
MLX_MTL_CONST short tgp_mem_size = BM > 1 ? BM*(blockN + TN) : 0;
MLX_MTL_CONST bool needs_tgp_reduction = BM > 1;
static METAL_FUNC void run(
const device T* mat [[buffer(0)]],
@@ -70,230 +292,113 @@ struct GEMVKernel {
// Appease compiler
(void)lid;
// Threadgroup in_vec cache
threadgroup T* in_vec_block = tgp_memory + simd_lid * TN * 2;
// Thread local accumulation results
thread T result[TM] = {0};
thread T inter[TN];
thread T v_coeff[TN];
// Block position
int out_row = (tid.x * BM + simd_gid) * TM;
// Exit simdgroup if rows out of bound
if (out_row >= out_vec_size)
return;
// Adjust tail simdgroup to ensure in bound reads
out_row = out_row + TM <= out_vec_size ? out_row : out_vec_size - TM;
// Advance matrix
mat += out_row * marix_ld;
// Loop over in_vec in blocks of BN * TN
for (int bn = simd_lid * TN; bn < in_vec_size; bn += BN * TN) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Prefetch in_vector for threadgroup use
if (simd_gid == 0) {
// Main load loop
if (bn + TN <= in_vec_size) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
in_vec_block[tn] = in_vec[bn + tn];
}
} else { // Edgecase
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
in_vec_block[tn] = bn + tn < in_vec_size ? in_vec[bn + tn] : 0;
}
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load for all rows
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
v_coeff[tn] = in_vec_block[tn];
}
// Per thread work loop
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
// Load for the row
if (bn + TN <= in_vec_size) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
inter[tn] = mat[tm * marix_ld + bn + tn];
}
} else { // Edgecase
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
int col_idx =
(bn + tn) < in_vec_size ? (bn + tn) : (in_vec_size - 1);
inter[tn] = mat[tm * marix_ld + col_idx];
}
}
// Accumulate results
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
result[tm] += inter[tn] * v_coeff[tn];
}
}
}
// Simdgroup accumulations
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
result[tm] = simd_sum(result[tm]);
}
// Write outputs
if (simd_lid == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
if (kDoAxpby) {
out_vec[out_row + tm] = static_cast<T>(alpha) * result[tm] +
static_cast<T>(beta) * bias[(out_row + tm) * bias_stride];
} else {
out_vec[out_row + tm] = result[tm];
}
}
}
}
};
///////////////////////////////////////////////////////////////////////////////
/// Vector matrix multiplication
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
const int BM, /* Threadgroup rows (in threads) */
const int BN, /* Threadgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN, /* Thread cols (in elements) */
const bool kDoAxpby> /* Do out = alpha * out + beta * bias */
struct GEMVTKernel {
// - The matrix of size (M = in_vec_size, N = out_vec_size) is divided up
// into blocks of (BM * TM, BN * TN) divided among threadgroups
// - Every thread works on a block of (TM, TN)
// - We assume each threadgroup is launched with (BN, BM, 1) threads
//
// 1. A thread loads TN elements each from mat along TM contiguous rows
// and the corresponding scalar from the vector
// 2. The thread then accumulates its local result for the block
// 3. At the end, each thread has accumulated results over all blocks across
// the rows. These are then summed up across the threadgroup
// 4. Each threadgroup writes its accumulated BN * TN outputs
//
// Edge case handling:
// - The threadgroup with the largest tid has blocks that exceed the matrix
// * The blocks that start outside the matrix are never read (thread results
// remain zero)
// * The last thread that partially overlaps with the matrix is shifted
// inwards such that the thread block fits exactly in the matrix
MLX_MTL_CONST short tgp_mem_size = BN * BM * TN;
static METAL_FUNC void run(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
const device T* bias [[buffer(2)]],
device T* out_vec [[buffer(3)]],
const constant int& in_vec_size [[buffer(4)]],
const constant int& out_vec_size [[buffer(5)]],
const constant int& marix_ld [[buffer(6)]],
const constant float& alpha [[buffer(7)]],
const constant float& beta [[buffer(8)]],
const constant int& bias_stride [[buffer(14)]],
threadgroup T* tgp_memory [[threadgroup(0)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
// Appease compiler
(void)simd_gid;
(void)simd_lid;
// Thread local accumulation results
T result[TN] = {0};
T inter[TN];
T v_coeff[TM];
// Threadgroup accumulation results
threadgroup T* tgp_results = tgp_memory + lid.x * BM * TN;
const int thrM = SN != 32 ? simd_lid / SN : 0;
const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);
int out_col = (tid.x * BN + lid.x) * TN;
int in_row = lid.y * TM;
const int sgM = BN != 1 ? (simd_gid / BN) : int(simd_gid);
const int sgN = BN != 1 ? (simd_gid % BN) : 0;
const int simdM = SM * sgM;
const int simdN = SN * sgN;
int cm = (simdM + thrM);
int cn = (simdN + thrN);
int bm = cm * TM;
int bn = cn * TN;
int out_col = tid.x * blockN + bn;
constexpr const uniform<int> loop_stride = make_uniform(blockM);
const uniform<int> in_size = make_uniform(in_vec_size);
const uniform<int> n_iter = in_size / loop_stride;
const uniform<int> last_iter = loop_stride * n_iter;
const uniform<int> leftover = in_size - last_iter;
// Edgecase handling
if (out_col < out_vec_size) {
out_col = out_col + TN < out_vec_size ? out_col : out_vec_size - TN;
// Per thread accumulation main loop
int bm = in_row;
for (; bm < in_vec_size; bm += BM * TM) {
for (int i = 0; i < n_iter; ++i) {
// Adding a threadgroup_barrier improves performance slightly
// This is possibly because it helps exploit the cache better
threadgroup_barrier(mem_flags::mem_none);
if (bm + TM <= in_vec_size) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
v_coeff[tm] = in_vec[bm + tm];
}
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
for (int tn = 0; tn < TN; tn++) {
inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
}
for (int tn = 0; tn < TN; tn++) {
result[tn] += v_coeff[tm] * inter[tn];
}
}
bm += blockM;
}
if (leftover > 0) {
for (int tm = 0; tm < TM && bm + tm < in_vec_size; tm++) {
v_coeff[tm] = in_vec[bm + tm];
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
v_coeff[tm] = in_vec[bm + tm];
for (int tn = 0; tn < TN; tn++) {
inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
}
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
for (int tn = 0; tn < TN; tn++) {
inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
}
for (int tn = 0; tn < TN; tn++) {
result[tn] += v_coeff[tm] * inter[tn];
}
}
} else { // Edgecase handling
for (int tm = 0; bm + tm < in_vec_size; tm++) {
v_coeff[tm] = in_vec[bm + tm];
for (int tn = 0; tn < TN; tn++) {
inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
}
for (int tn = 0; tn < TN; tn++) {
result[tn] += v_coeff[tm] * inter[tn];
}
for (int tn = 0; tn < TN; tn++) {
result[tn] += v_coeff[tm] * inter[tn];
}
}
}
}
// Threadgroup collection
// Simdgroup accumulations
MLX_MTL_PRAGMA_UNROLL
for (int i = 0; i < TN; i++) {
tgp_results[lid.y * TN + i] = result[i];
for (int tn = 0; tn < TN; tn++) {
MLX_MTL_PRAGMA_UNROLL
for (ushort sm = (SM / 2); sm >= 1; sm >>= 1) {
result[tn] += simd_shuffle_down(result[tn], SN * sm);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Threadgroup accumulation results
if (needs_tgp_reduction) {
threadgroup T* tgp_results = tgp_memory + sgM * (blockN + TN) + bn;
if (thrM == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
tgp_results[tn] = result[tn];
}
threadgroup_barrier(mem_flags::mem_none);
if (sgM == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int sgm = 1; sgm < BM; sgm++) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
result[tn] += tgp_results[sgm * (blockN + TN) + tn];
}
}
}
}
}
// Threadgroup accumulation and writing out results
if (lid.y == 0 && out_col < out_vec_size) {
MLX_MTL_PRAGMA_UNROLL
for (int i = 1; i < BM; i++) {
MLX_MTL_PRAGMA_UNROLL
for (int j = 0; j < TN; j++) {
result[j] += tgp_results[i * TN + j];
}
}
if (cm == 0 && out_col < out_vec_size) {
MLX_MTL_PRAGMA_UNROLL
for (int j = 0; j < TN; j++) {
if (kDoAxpby) {
@@ -313,13 +418,15 @@ struct GEMVTKernel {
template <
typename T,
const int BM, /* Threadgroup rows (in threads) */
const int BN, /* Threadgroup cols (in threads) */
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN, /* Thread cols (in elements) */
const bool kDoNCBatch, /* Batch ndim > 1 */
const bool kDoAxpby> /* Do out = alpha * out + beta * bias */
[[kernel, max_total_threads_per_threadgroup(BM* BN)]] void gemv(
[[kernel, max_total_threads_per_threadgroup(BM* BN * 32)]] void gemv(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
const device T* bias [[buffer(2)]],
@@ -339,8 +446,9 @@ template <
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
using gemv_kernel = GEMVKernel<T, BM, BN, TM, TN, kDoAxpby>;
threadgroup T tgp_memory[gemv_kernel::tgp_mem_size];
using gemv_kernel = GEMVKernel<T, BM, BN, SM, SN, TM, TN, kDoAxpby>;
threadgroup T tgp_memory
[gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];
// Update batch offsets
if (kDoNCBatch) {
@@ -373,17 +481,19 @@ template <
alpha,
beta,
bias_stride,
tgp_memory,
gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
tid,
lid,
simd_gid,
simd_lid);
}
#define instantiate_gemv_helper(name, itype, bm, bn, tm, tn, nc, axpby) \
template [[host_name("gemv_" #name "_bm" #bm "_bn" #bn "_tm" #tm "_tn" #tn \
"_nc" #nc "_axpby" #axpby)]] [[kernel]] void \
gemv<itype, bm, bn, tm, tn, nc, axpby>( \
#define instantiate_gemv_helper( \
name, itype, bm, bn, sm, sn, tm, tn, nc, axpby) \
template [[host_name("gemv_" #name "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn \
"_tm" #tm "_tn" #tn "_nc" #nc \
"_axpby" #axpby)]] [[kernel]] void \
gemv<itype, bm, bn, sm, sn, tm, tn, nc, axpby>( \
const device itype* mat [[buffer(0)]], \
const device itype* in_vec [[buffer(1)]], \
const device itype* bias [[buffer(2)]], \
@@ -405,11 +515,11 @@ template <
uint simd_lid [[thread_index_in_simdgroup]]);
// clang-format off
#define instantiate_gemv(name, itype, bm, bn, tm, tn) \
instantiate_gemv_helper(name, itype, bm, bn, tm, tn, 0, 0) \
instantiate_gemv_helper(name, itype, bm, bn, tm, tn, 0, 1) \
instantiate_gemv_helper(name, itype, bm, bn, tm, tn, 1, 0) \
instantiate_gemv_helper(name, itype, bm, bn, tm, tn, 1, 1) // clang-format on
#define instantiate_gemv(name, itype, bm, bn, tm, tn) \
instantiate_gemv_helper(name, itype, bm, 1, 1, bn, tm, tn, 0, 0) \
instantiate_gemv_helper(name, itype, bm, 1, 1, bn, tm, tn, 0, 1) \
instantiate_gemv_helper(name, itype, bm, 1, 1, bn, tm, tn, 1, 0) \
instantiate_gemv_helper(name, itype, bm, 1, 1, bn, tm, tn, 1, 1) // clang-format on
// clang-format off
#define instantiate_gemv_blocks(name, itype) \
@@ -423,11 +533,13 @@ instantiate_gemv_blocks(bfloat16, bfloat16_t);
template <
typename T,
const int BM, /* Threadgroup rows (in threads) */
const int BN, /* Threadgroup cols (in threads) */
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN> /* Thread cols (in elements) */
[[kernel, max_total_threads_per_threadgroup(BM* BN)]] void gemv_bs(
[[kernel, max_total_threads_per_threadgroup(BM* BN * 32)]] void gemv_gather(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
const device T* bias [[buffer(2)]],
@@ -452,8 +564,9 @@ template <
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
using gemv_kernel = GEMVKernel<T, BM, BN, TM, TN, false>;
threadgroup T tgp_memory[gemv_kernel::tgp_mem_size];
using gemv_kernel = GEMVKernel<T, BM, BN, SM, SN, TM, TN, false>;
threadgroup T tgp_memory
[gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];
uint32_t indx_vec;
uint32_t indx_mat;
@@ -501,47 +614,47 @@ template <
alpha,
beta,
batch_ndim, // Not used
tgp_memory,
gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
tid,
lid,
simd_gid,
simd_lid);
}
#define instantiate_gemv_bs_helper(nm, itype, bm, bn, tm, tn) \
template [[host_name("gemv_bs_" #nm "_bm" #bm "_bn" #bn "_tm" #tm \
"_tn" #tn)]] [[kernel]] void \
gemv_bs<itype, bm, bn, tm, tn>( \
const device itype* mat [[buffer(0)]], \
const device itype* in_vec [[buffer(1)]], \
const device itype* bias [[buffer(2)]], \
device itype* out_vec [[buffer(3)]], \
const constant int& in_vec_size [[buffer(4)]], \
const constant int& out_vec_size [[buffer(5)]], \
const constant int& marix_ld [[buffer(6)]], \
const constant float& alpha [[buffer(7)]], \
const constant float& beta [[buffer(8)]], \
const constant int& batch_ndim [[buffer(9)]], \
const constant int* batch_shape [[buffer(10)]], \
const constant size_t* index_batch_strides [[buffer(11)]], \
const constant int& vector_batch_ndim [[buffer(12)]], \
const constant int* vector_batch_shape [[buffer(13)]], \
const constant size_t* vector_batch_stride [[buffer(14)]], \
const constant int& matrix_batch_ndim [[buffer(15)]], \
const constant int* matrix_batch_shape [[buffer(16)]], \
const constant size_t* matrix_batch_stride [[buffer(17)]], \
const constant uint32_t* vec_indices [[buffer(18)]], \
const constant uint32_t* mat_indices [[buffer(19)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint simd_gid [[simdgroup_index_in_threadgroup]], \
#define instantiate_gemv_bs_helper(nm, itype, bm, bn, sm, sn, tm, tn) \
template [[host_name("gemv_gather_" #nm "_bm" #bm "_bn" #bn "_sm" #sm \
"_sn" #sn "_tm" #tm "_tn" #tn)]] [[kernel]] void \
gemv_gather<itype, bm, bn, sm, sn, tm, tn>( \
const device itype* mat [[buffer(0)]], \
const device itype* in_vec [[buffer(1)]], \
const device itype* bias [[buffer(2)]], \
device itype* out_vec [[buffer(3)]], \
const constant int& in_vec_size [[buffer(4)]], \
const constant int& out_vec_size [[buffer(5)]], \
const constant int& marix_ld [[buffer(6)]], \
const constant float& alpha [[buffer(7)]], \
const constant float& beta [[buffer(8)]], \
const constant int& batch_ndim [[buffer(9)]], \
const constant int* batch_shape [[buffer(10)]], \
const constant size_t* index_batch_strides [[buffer(11)]], \
const constant int& vector_batch_ndim [[buffer(12)]], \
const constant int* vector_batch_shape [[buffer(13)]], \
const constant size_t* vector_batch_stride [[buffer(14)]], \
const constant int& matrix_batch_ndim [[buffer(15)]], \
const constant int* matrix_batch_shape [[buffer(16)]], \
const constant size_t* matrix_batch_stride [[buffer(17)]], \
const constant uint32_t* vec_indices [[buffer(18)]], \
const constant uint32_t* mat_indices [[buffer(19)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint simd_gid [[simdgroup_index_in_threadgroup]], \
uint simd_lid [[thread_index_in_simdgroup]]);
// clang-format off
#define instantiate_gemv_bs_blocks(name, itype) \
instantiate_gemv_bs_helper(name, itype, 4, 32, 1, 4) \
instantiate_gemv_bs_helper(name, itype, 4, 32, 4, 4) \
instantiate_gemv_bs_helper(name, itype, 8, 32, 4, 4) // clang-format on
instantiate_gemv_bs_helper(name, itype, 4, 1, 1, 32, 1, 4) \
instantiate_gemv_bs_helper(name, itype, 4, 1, 1, 32, 4, 4) \
instantiate_gemv_bs_helper(name, itype, 8, 1, 1, 32, 4, 4) // clang-format on
instantiate_gemv_bs_blocks(float32, float);
instantiate_gemv_bs_blocks(float16, half);
@@ -553,13 +666,15 @@ instantiate_gemv_bs_blocks(bfloat16, bfloat16_t);
template <
typename T,
const int BM, /* Threadgroup rows (in threads) */
const int BN, /* Threadgroup cols (in threads) */
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN, /* Thread cols (in elements) */
const bool kDoNCBatch, /* Batch ndim > 1 */
const bool kDoAxpby> /* Do out = alpha * out + beta * bias */
[[kernel, max_total_threads_per_threadgroup(BM* BN)]] void gemv_t(
[[kernel, max_total_threads_per_threadgroup(BM* BN * 32)]] void gemv_t(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
const device T* bias [[buffer(2)]],
@@ -579,8 +694,9 @@ template <
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
using gemv_kernel = GEMVTKernel<T, BM, BN, TM, TN, kDoAxpby>;
threadgroup T tgp_memory[gemv_kernel::tgp_mem_size];
using gemv_kernel = GEMVTKernel<T, BM, BN, SM, SN, TM, TN, kDoAxpby>;
threadgroup T tgp_memory
[gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];
// Update batch offsets
if (kDoNCBatch) {
@@ -613,17 +729,19 @@ template <
alpha,
beta,
bias_stride,
tgp_memory,
gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
tid,
lid,
simd_gid,
simd_lid);
}
#define instantiate_gemv_t_helper(name, itype, bm, bn, tm, tn, nc, axpby) \
template [[host_name("gemv_t_" #name "_bm" #bm "_bn" #bn "_tm" #tm "_tn" #tn \
"_nc" #nc "_axpby" #axpby)]] [[kernel]] void \
gemv_t<itype, bm, bn, tm, tn, nc, axpby>( \
#define instantiate_gemv_t_helper( \
name, itype, bm, bn, sm, sn, tm, tn, nc, axpby) \
template [[host_name("gemv_t_" #name "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn \
"_tm" #tm "_tn" #tn "_nc" #nc \
"_axpby" #axpby)]] [[kernel]] void \
gemv_t<itype, bm, bn, sm, sn, tm, tn, nc, axpby>( \
const device itype* mat [[buffer(0)]], \
const device itype* in_vec [[buffer(1)]], \
const device itype* bias [[buffer(2)]], \
@@ -645,20 +763,19 @@ template <
uint simd_lid [[thread_index_in_simdgroup]]);
// clang-format off
#define instantiate_gemv_t(name, itype, bm, bn, tm, tn) \
instantiate_gemv_t_helper(name, itype, bm, bn, tm, tn, 0, 0) \
instantiate_gemv_t_helper(name, itype, bm, bn, tm, tn, 0, 1) \
instantiate_gemv_t_helper(name, itype, bm, bn, tm, tn, 1, 0) \
instantiate_gemv_t_helper(name, itype, bm, bn, tm, tn, 1, 1) // clang-format on
#define instantiate_gemv_t(name, itype, bm, bn, sm, sn, tm, tn) \
instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 0, 0) \
instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 0, 1) \
instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 1, 0) \
instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 1, 1) // clang-format on
// clang-format off
#define instantiate_gemv_t_blocks(name, itype) \
instantiate_gemv_t(name, itype, 8, 8, 4, 1) \
instantiate_gemv_t(name, itype, 8, 8, 4, 4) \
instantiate_gemv_t(name, itype, 8, 16, 4, 4) \
instantiate_gemv_t(name, itype, 8, 32, 4, 4) \
instantiate_gemv_t(name, itype, 8, 64, 4, 4) \
instantiate_gemv_t(name, itype, 8, 128, 4, 4) // clang-format on
instantiate_gemv_t(name, itype, 1, 2, 8, 4, 4, 1) \
instantiate_gemv_t(name, itype, 1, 2, 8, 4, 4, 4) \
instantiate_gemv_t(name, itype, 1, 4, 8, 4, 4, 4) \
instantiate_gemv_t(name, itype, 1, 16, 8, 4, 4, 4) \
instantiate_gemv_t(name, itype, 1, 16, 4, 8, 4, 4) // clang-format on
// clang-format off
instantiate_gemv_t_blocks(float32, float);
@@ -667,11 +784,13 @@ instantiate_gemv_t_blocks(bfloat16, bfloat16_t); // clang-format on
template <
typename T,
const int BM, /* Threadgroup rows (in threads) */
const int BN, /* Threadgroup cols (in threads) */
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN> /* Thread cols (in elements) */
[[kernel, max_total_threads_per_threadgroup(BM* BN)]] void gemv_t_bs(
[[kernel, max_total_threads_per_threadgroup(BM* BN * 32)]] void gemv_t_gather(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
const device T* bias [[buffer(2)]],
@@ -696,8 +815,9 @@ template <
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
using gemv_kernel = GEMVTKernel<T, BM, BN, TM, TN, false>;
threadgroup T tgp_memory[gemv_kernel::tgp_mem_size];
using gemv_kernel = GEMVTKernel<T, BM, BN, SM, SN, TM, TN, false>;
threadgroup T tgp_memory
[gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];
uint32_t indx_vec;
uint32_t indx_mat;
@@ -745,50 +865,49 @@ template <
alpha,
beta,
batch_ndim, // Not used,
tgp_memory,
gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
tid,
lid,
simd_gid,
simd_lid);
}
#define instantiate_gemv_t_bs_helper(nm, itype, bm, bn, tm, tn) \
template [[host_name("gemv_t_bs_" #nm "_bm" #bm "_bn" #bn "_tm" #tm \
"_tn" #tn)]] [[kernel]] void \
gemv_t_bs<itype, bm, bn, tm, tn>( \
const device itype* mat [[buffer(0)]], \
const device itype* in_vec [[buffer(1)]], \
const device itype* bias [[buffer(2)]], \
device itype* out_vec [[buffer(3)]], \
const constant int& in_vec_size [[buffer(4)]], \
const constant int& out_vec_size [[buffer(5)]], \
const constant int& marix_ld [[buffer(6)]], \
const constant float& alpha [[buffer(7)]], \
const constant float& beta [[buffer(8)]], \
const constant int& batch_ndim [[buffer(9)]], \
const constant int* batch_shape [[buffer(10)]], \
const constant size_t* index_batch_strides [[buffer(11)]], \
const constant int& vector_batch_ndim [[buffer(12)]], \
const constant int* vector_batch_shape [[buffer(13)]], \
const constant size_t* vector_batch_stride [[buffer(14)]], \
const constant int& matrix_batch_ndim [[buffer(15)]], \
const constant int* matrix_batch_shape [[buffer(16)]], \
const constant size_t* matrix_batch_stride [[buffer(17)]], \
const constant uint32_t* vec_indices [[buffer(18)]], \
const constant uint32_t* mat_indices [[buffer(19)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint simd_gid [[simdgroup_index_in_threadgroup]], \
#define instantiate_gemv_t_bs_helper(nm, itype, bm, bn, sm, sn, tm, tn) \
template [[host_name("gemv_t_gather_" #nm "_bm" #bm "_bn" #bn "_sm" #sm \
"_sn" #sn "_tm" #tm "_tn" #tn)]] [[kernel]] void \
gemv_t_gather<itype, bm, bn, sm, sn, tm, tn>( \
const device itype* mat [[buffer(0)]], \
const device itype* in_vec [[buffer(1)]], \
const device itype* bias [[buffer(2)]], \
device itype* out_vec [[buffer(3)]], \
const constant int& in_vec_size [[buffer(4)]], \
const constant int& out_vec_size [[buffer(5)]], \
const constant int& marix_ld [[buffer(6)]], \
const constant float& alpha [[buffer(7)]], \
const constant float& beta [[buffer(8)]], \
const constant int& batch_ndim [[buffer(9)]], \
const constant int* batch_shape [[buffer(10)]], \
const constant size_t* index_batch_strides [[buffer(11)]], \
const constant int& vector_batch_ndim [[buffer(12)]], \
const constant int* vector_batch_shape [[buffer(13)]], \
const constant size_t* vector_batch_stride [[buffer(14)]], \
const constant int& matrix_batch_ndim [[buffer(15)]], \
const constant int* matrix_batch_shape [[buffer(16)]], \
const constant size_t* matrix_batch_stride [[buffer(17)]], \
const constant uint32_t* vec_indices [[buffer(18)]], \
const constant uint32_t* mat_indices [[buffer(19)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint simd_gid [[simdgroup_index_in_threadgroup]], \
uint simd_lid [[thread_index_in_simdgroup]]);
// clang-format off
#define instantiate_gemv_t_bs_blocks(name, itype) \
instantiate_gemv_t_bs_helper(name, itype, 8, 8, 4, 1) \
instantiate_gemv_t_bs_helper(name, itype, 8, 8, 4, 4) \
instantiate_gemv_t_bs_helper(name, itype, 8, 16, 4, 4) \
instantiate_gemv_t_bs_helper(name, itype, 8, 32, 4, 4) \
instantiate_gemv_t_bs_helper(name, itype, 8, 64, 4, 4) \
instantiate_gemv_t_bs_helper(name, itype, 8, 128, 4, 4) // clang-format on
#define instantiate_gemv_t_bs_blocks(name, itype) \
instantiate_gemv_t_bs_helper(name, itype, 1, 2, 8, 4, 4, 1) \
instantiate_gemv_t_bs_helper(name, itype, 1, 2, 8, 4, 4, 4) \
instantiate_gemv_t_bs_helper(name, itype, 1, 4, 8, 4, 4, 4) \
instantiate_gemv_t_bs_helper(name, itype, 1, 16, 8, 4, 4, 4) \
instantiate_gemv_t_bs_helper(name, itype, 1, 16, 4, 8, 4, 4) // clang-format on
// clang-format off
instantiate_gemv_t_bs_blocks(float32, float);


@@ -0,0 +1,939 @@
// Copyright © 2023-2024 Apple Inc.
#include <metal_simdgroup>
#include <metal_stdlib>
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/utils.h"
using namespace metal;
///////////////////////////////////////////////////////////////////////////////
/// Matrix vector multiplication
///////////////////////////////////////////////////////////////////////////////
#define MLX_MTL_CONST static constant constexpr const
struct _NoMask {
char x;
constexpr METAL_FUNC operator bool() {
return true;
}
constexpr METAL_FUNC operator bool() const threadgroup {
return true;
}
constexpr METAL_FUNC operator bool() const device {
return true;
}
constexpr METAL_FUNC operator bool() const constant {
return true;
}
};
typedef struct _NoMask nomask_t;
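// Added note (not in the original source): nomask_t is a sentinel mask type.
// The kernels below test metal::is_same_v<..., nomask_t>, so whether a mask
// is applied is known at compile time and the unused masking branches can be
// optimized away entirely.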
template <typename OutT, typename InT = OutT>
struct ScaleOp {
OutT scale;
METAL_FUNC OutT apply(InT x) const {
return static_cast<OutT>(x) * scale;
}
};
template <
typename T,
typename out_mask_t,
typename op_mask_t,
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN> /* Thread cols (in elements) */
struct GEMVKernel {
MLX_MTL_CONST int threadsM = BM * SM;
MLX_MTL_CONST int threadsN = BN * SN;
MLX_MTL_CONST int blockM = threadsM * TM;
MLX_MTL_CONST int blockN = threadsN * TN;
static_assert(SM * SN == 32, "simdgroup can only have 32 threads");
static_assert(
SN == 8 || SN == 16 || SN == 32,
"gemv block must have a width of 8, 16, or 32");
static_assert(blockN >= blockM, "Masked gemv must have blockN >= blockM");
MLX_MTL_CONST bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
MLX_MTL_CONST bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;
MLX_MTL_CONST bool has_mul_operand_mask =
has_operand_mask && !metal::is_same_v<op_mask_t, bool>;
MLX_MTL_CONST bool has_mul_output_mask =
has_output_mask && !metal::is_same_v<out_mask_t, bool>;
// - The matrix of size (M = out_vec_size, K = in_vec_size) is divided up
// into blocks of (blockM, blockN) divided among threadgroups
// - Every thread works on a block of (TM, TN)
// - We assume each threadgroup has (threadsN, threadsM, 1) threads
//
// 1. A thread loads TN elements each from mat along TM rows
// and the corresponding scalar from the vector
// 2. The thread then multiplies and adds to accumulate its local result for
// the block
// 3. At the end, each thread has accumulated results over all blocks across
// the rows. These are then summed up across the threadgroup
// 4. Each threadgroup writes its accumulated blockM outputs
//
// Edge case handling:
// - The threadgroup with the largest tid has blocks that exceed the matrix
// * The blocks that start outside the matrix are never read (thread results
// remain zero)
// * The last thread that partially overlaps with the matrix is shifted
// inwards such that the thread block fits exactly in the matrix
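//
// Worked example (added for illustration, using the instantiation
// instantiate_gemv(..., 2, 1, 4, 8, 1, 4) declared later in this file):
//   BM = 2, BN = 1, SM = 4, SN = 8, TM = 1, TN = 4
//   threadsM = BM * SM = 8,  threadsN = BN * SN = 8
//   blockM = threadsM * TM = 8,  blockN = threadsN * TN = 32
//   Since BN == 1, tgp_mem_size == 0 and needs_tgp_reduction is false,
//   so the per-row reduction stays entirely inside each simdgroup.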
MLX_MTL_CONST short tgp_mem_size = BN > 1 ? BN*(blockM + TM) : 0;
MLX_MTL_CONST bool needs_tgp_reduction = BN > 1;
static METAL_FUNC void
load_unsafe(const device T* src, thread T dst[TN], const int src_offset = 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
dst[tn] = src[src_offset + tn];
}
}
static METAL_FUNC void load_safe(
const device T* src,
thread T dst[TN],
const int src_offset = 0,
const int src_size = TN) {
if (src_offset + TN <= src_size) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
dst[tn] = src[src_offset + tn];
}
} else { // Edgecase
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
dst[tn] = src_offset + tn < src_size ? src[src_offset + tn] : 0;
}
}
}
static METAL_FUNC void run(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
device T* out_vec [[buffer(3)]],
const constant int& in_vec_size [[buffer(4)]],
const constant int& out_vec_size [[buffer(5)]],
const constant int& matrix_ld [[buffer(6)]],
const device out_mask_t* out_mask [[buffer(20)]],
const device op_mask_t* mat_mask [[buffer(21)]],
const device op_mask_t* vec_mask [[buffer(22)]],
const constant int* mask_strides [[buffer(23)]],
threadgroup T* tgp_memory [[threadgroup(0)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
// Appease compiler
(void)lid;
// Thread local accumulation results
thread T result[TM] = {0};
thread T inter[TN];
thread T v_coeff[TN];
const int thrM = SN != 32 ? simd_lid / SN : 0;
const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);
const int sgN = BN != 1 ? (simd_gid % BN) : 0;
const int simdM = BN != 1 ? SM * (simd_gid / BN) : int(SM * simd_gid);
const int simdN = BN != 1 ? SN * (simd_gid % BN) : 0;
int bm = (simdM + thrM) * TM;
int bn = (simdN + thrN) * TN;
// Block position
int out_row = tid.x * blockM + bm;
// Exit simdgroup if rows out of bound
if (out_row >= out_vec_size)
return;
// Adjust tail simdgroup to ensure in bound reads
out_row = out_row + TM <= out_vec_size ? out_row : out_vec_size - TM;
// Prepare mask offsets
const constant int* out_mask_strides = mask_strides;
const constant int* mat_mask_strides =
mask_strides + (has_output_mask ? 2 : 0);
const constant int* vec_mask_strides =
mat_mask_strides + (has_operand_mask ? 2 : 0);
const int m_block_idx = blockN > blockM ? out_row / blockN : int(tid.x);
const int out_mask_offset =
!has_output_mask ? 0 : m_block_idx * out_mask_strides[1];
int mat_mask_offset =
!has_operand_mask ? 0 : m_block_idx * mat_mask_strides[1];
int vec_mask_offset = 0;
const int mat_mask_step = !has_operand_mask ? 0 : mat_mask_strides[0];
const int vec_mask_step = !has_operand_mask ? 0 : vec_mask_strides[1];
T out_scale{1};
// Check output mask
if (has_output_mask) {
auto mask_out = out_mask[out_mask_offset];
// Write zeros and return if mask is 0
if (!mask_out) {
if (simdN == 0 && thrN == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
out_vec[out_row + tm] = T(0.);
}
}
return;
}
// Store scalar if multiplicative mask
if (has_mul_output_mask) {
out_scale = T(mask_out);
}
}
// Advance matrix
mat += out_row * matrix_ld;
// Prepare for loop
constexpr const uniform<int> loop_stride = make_uniform(blockN);
const uniform<int> in_size = make_uniform(in_vec_size);
const uniform<int> n_iter = in_size / loop_stride;
const uniform<int> last_iter = loop_stride * n_iter;
const uniform<int> leftover = in_size - last_iter;
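// Worked example (added for illustration): with blockN = 32 and
// in_vec_size = 100, n_iter = 3, last_iter = 96 and leftover = 4, so the
// main loop covers three full blocks and the tail below uses load_safe for
// the remaining 4 elements.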
// Loop over in_vec in blocks of blockN
for (int i = 0; i < n_iter; ++i) {
if (!has_operand_mask ||
(bool(mat_mask[mat_mask_offset]) &&
bool(vec_mask[vec_mask_offset]))) {
T block_scale{1};
if (has_mul_operand_mask) {
block_scale =
T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
}
load_unsafe(in_vec, v_coeff, bn);
// Apply scale
if (has_mul_operand_mask) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
v_coeff[tn] *= block_scale;
}
}
// Per thread work loop
int mat_offset = 0;
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
// Load for the row
load_unsafe(mat, inter, mat_offset + bn);
// Accumulate results
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
result[tm] += inter[tn] * v_coeff[tn];
}
mat_offset += matrix_ld;
}
}
bn += blockN;
mat_mask_offset += mat_mask_step;
vec_mask_offset += vec_mask_step;
}
if (leftover > 0 &&
(!has_operand_mask ||
(bool(mat_mask[mat_mask_offset]) &&
bool(vec_mask[vec_mask_offset])))) {
T block_scale{1};
if (has_mul_operand_mask) {
block_scale =
T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
}
load_safe(in_vec, v_coeff, bn, in_size);
// Apply scale
if (has_mul_operand_mask) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
v_coeff[tn] *= block_scale;
}
}
// Per thread work loop
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
// Load for the row
load_safe(&mat[tm * matrix_ld], inter, bn, in_size);
// Accumulate results
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
result[tm] += inter[tn] * v_coeff[tn];
}
}
}
// Apply out scale
if (has_mul_output_mask) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
result[tm] *= out_scale;
}
}
// Simdgroup accumulations
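// Added note: this butterfly halves the shuffle distance each step and sums
// the partial dot products held by the SN column-threads of a simdgroup row,
// leaving each row's total in the lane with thrN == 0.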
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
MLX_MTL_PRAGMA_UNROLL
for (ushort sn = (SN / 2); sn >= 1; sn >>= 1) {
result[tm] += simd_shuffle_down(result[tm], sn);
}
}
// Threadgroup accumulation results
if (needs_tgp_reduction) {
threadgroup T* tgp_results = tgp_memory + sgN * (blockM + TM) + bm;
if (thrN == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
tgp_results[tm] = result[tm];
}
threadgroup_barrier(mem_flags::mem_none);
if (sgN == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int sgn = 1; sgn < BN; sgn++) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
result[tm] += tgp_results[sgn * (blockM + TM) + tm];
}
}
}
}
}
// Write outputs
if (simdN == 0 && thrN == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
out_vec[out_row + tm] = result[tm];
}
}
}
};
///////////////////////////////////////////////////////////////////////////////
/// Vector matrix multiplication
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
typename out_mask_t,
typename op_mask_t,
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN> /* Thread cols (in elements) */
struct GEMVTKernel {
MLX_MTL_CONST int threadsM = BM * SM;
MLX_MTL_CONST int threadsN = BN * SN;
MLX_MTL_CONST int blockM = threadsM * TM;
MLX_MTL_CONST int blockN = threadsN * TN;
static_assert(SM * SN == 32, "simdgroup can only have 32 threads");
MLX_MTL_CONST bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
MLX_MTL_CONST bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;
MLX_MTL_CONST bool has_mul_operand_mask =
has_operand_mask && !metal::is_same_v<op_mask_t, bool>;
MLX_MTL_CONST bool has_mul_output_mask =
has_output_mask && !metal::is_same_v<out_mask_t, bool>;
// - The matrix of size (M = in_vec_size, N = out_vec_size) is divided up
// into blocks of (blockM, blockN) divided among threadgroups
// - Every thread works on a block of (TM, TN)
// - We assume each threadgroup has (threadsN, threadsM, 1) threads
//
// 1. A thread loads TN elements each from mat along TM contiguous rows
// and the corresponding scalar from the vector
// 2. The thread then accumulates its local result for the block
// 3. At the end, each thread has accumulated results over all blocks across
// the rows. These are then summed up across the threadgroup
// 4. Each threadgroup writes its accumulated BN * TN outputs
//
// Edge case handling:
// - The threadgroup with the largest tid has blocks that exceed the matrix
// * The blocks that start outside the matrix are never read (thread results
// remain zero)
// * The last thread that partially overlaps with the matrix is shifted
// inwards such that the thread block fits exactly in the matrix
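//
// Worked example (added for illustration, using the instantiation
// instantiate_gemv_t(..., 1, 2, 8, 4, 4, 4) declared later in this file):
//   BM = 1, BN = 2, SM = 8, SN = 4, TM = 4, TN = 4
//   threadsM = 8, threadsN = 8, blockM = 32, blockN = 32
//   In each main-loop iteration a thread reads TM = 4 vector elements plus a
//   4 x 4 tile of mat and accumulates TN = 4 column partials; the
//   simd_shuffle_down step then sums those partials across the SM = 8
//   row-threads of the simdgroup. With BM == 1 no threadgroup memory is
//   needed (tgp_mem_size == 0).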
MLX_MTL_CONST short tgp_mem_size = BM > 1 ? BM*(blockN + TN) : 0;
MLX_MTL_CONST bool needs_tgp_reduction = BM > 1;
static METAL_FUNC void run(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
device T* out_vec [[buffer(3)]],
const constant int& in_vec_size [[buffer(4)]],
const constant int& out_vec_size [[buffer(5)]],
const constant int& marix_ld [[buffer(6)]],
const device out_mask_t* out_mask [[buffer(20)]],
const device op_mask_t* mat_mask [[buffer(21)]],
const device op_mask_t* vec_mask [[buffer(22)]],
const constant int* mask_strides [[buffer(23)]],
threadgroup T* tgp_memory [[threadgroup(0)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
// Appease compiler
(void)lid;
// Thread local accumulation results
T result[TN] = {0};
T inter[TN];
T v_coeff[TM];
const int thrM = SN != 32 ? simd_lid / SN : 0;
const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);
const int sgM = BN != 1 ? (simd_gid / BN) : int(simd_gid);
const int sgN = BN != 1 ? (simd_gid % BN) : 0;
const int simdM = SM * sgM;
const int simdN = SN * sgN;
int cm = (simdM + thrM);
int cn = (simdN + thrN);
int bm = cm * TM;
int bn = cn * TN;
int out_col = tid.x * blockN + bn;
// Prepare mask offsets
const constant int* out_mask_strides = mask_strides;
const constant int* mat_mask_strides =
out_mask_strides + (has_output_mask ? 2 : 0);
const constant int* vec_mask_strides =
mat_mask_strides + (has_operand_mask ? 2 : 0);
const int n_block_idx = blockM > blockN ? out_col / blockM : int(tid.x);
const int out_mask_offset =
!has_output_mask ? 0 : n_block_idx; // * out_mask_strides[0];
int mat_mask_offset =
!has_operand_mask ? 0 : n_block_idx * mat_mask_strides[0];
int vec_mask_offset = 0;
const int mat_mask_step = !has_operand_mask ? 0 : mat_mask_strides[1];
const int vec_mask_step = !has_operand_mask ? 0 : vec_mask_strides[0];
T out_scale{1};
// Check output mask
if (has_output_mask) {
auto mask_out = out_mask[out_mask_offset];
// Write zeros and return if mask is 0
if (!mask_out) {
if (cm == 0 && out_col < out_vec_size) {
if (out_col + TN <= out_vec_size) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
out_vec[out_col + tn] = T(0.);
}
} else {
for (int tn = 0; tn < TN && (out_col + tn) < out_vec_size; tn++) {
out_vec[out_col + tn] = T(0.);
}
}
}
return;
}
// Store scalar if multiplicative mask
if (has_mul_output_mask) {
out_scale = T(mask_out);
}
}
// Prepare for loop
constexpr const uniform<int> loop_stride = make_uniform(blockM);
const uniform<int> in_size = make_uniform(in_vec_size);
const uniform<int> n_iter = in_size / loop_stride;
const uniform<int> last_iter = loop_stride * n_iter;
const uniform<int> leftover = in_size - last_iter;
// Edgecase handling
if (out_col < out_vec_size) {
out_col = (out_col + TN) <= out_vec_size ? out_col : out_vec_size - TN;
// Per thread accumulation main loop
for (int i = 0; i < n_iter; ++i) {
// Adding a threadgroup_barrier improves performance slightly
// This is possibly because it may help exploit the cache better
threadgroup_barrier(mem_flags::mem_none);
if (!has_operand_mask ||
(bool(mat_mask[mat_mask_offset]) &&
bool(vec_mask[vec_mask_offset]))) {
T block_scale{1};
if (has_mul_operand_mask) {
block_scale =
T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
}
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
v_coeff[tm] = in_vec[bm + tm];
}
// Apply scale
if (has_mul_operand_mask) {
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
v_coeff[tm] *= block_scale;
}
}
MLX_MTL_PRAGMA_UNROLL
for (int tm = 0; tm < TM; tm++) {
for (int tn = 0; tn < TN; tn++) {
inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
}
for (int tn = 0; tn < TN; tn++) {
result[tn] += v_coeff[tm] * inter[tn];
}
}
}
bm += blockM;
mat_mask_offset += mat_mask_step;
vec_mask_offset += vec_mask_step;
}
if (leftover > 0 &&
(!has_operand_mask ||
(bool(mat_mask[mat_mask_offset]) &&
bool(vec_mask[vec_mask_offset])))) {
T block_scale{1};
if (has_mul_operand_mask) {
block_scale =
T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
}
for (int tm = 0; tm < TM && bm + tm < in_vec_size; tm++) {
v_coeff[tm] = in_vec[bm + tm];
if (has_mul_operand_mask) {
v_coeff[tm] *= block_scale;
}
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
}
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
result[tn] += v_coeff[tm] * inter[tn];
}
}
}
}
// Apply out scale
if (has_mul_output_mask) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
result[tn] *= out_scale;
}
}
// Simdgroup accumulations
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
MLX_MTL_PRAGMA_UNROLL
for (ushort sm = (SM / 2); sm >= 1; sm >>= 1) {
result[tn] += simd_shuffle_down(result[tn], SN * sm);
}
}
// Threadgroup accumulation results
if (needs_tgp_reduction) {
threadgroup T* tgp_results = tgp_memory + sgM * (blockN + TN) + bn;
if (thrM == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
tgp_results[tn] = result[tn];
}
threadgroup_barrier(mem_flags::mem_none);
if (sgM == 0) {
MLX_MTL_PRAGMA_UNROLL
for (int sgm = 1; sgm < BM; sgm++) {
MLX_MTL_PRAGMA_UNROLL
for (int tn = 0; tn < TN; tn++) {
result[tn] += tgp_results[sgm * (blockN + TN) + tn];
}
}
}
}
}
// Threadgroup accumulation and writing out results
if (cm == 0 && out_col < out_vec_size) {
MLX_MTL_PRAGMA_UNROLL
for (int j = 0; j < TN; j++) {
out_vec[out_col + j] = result[j];
}
}
}
};
///////////////////////////////////////////////////////////////////////////////
/// Matrix vector multiplication
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
typename out_mask_t,
typename op_mask_t,
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN, /* Thread cols (in elements) */
const bool kDoNCBatch> /* Batch ndim > 1 */
[[kernel, max_total_threads_per_threadgroup(BM* BN * 32)]] void gemv_masked(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
device T* out_vec [[buffer(3)]],
const constant int& in_vec_size [[buffer(4)]],
const constant int& out_vec_size [[buffer(5)]],
const constant int& marix_ld [[buffer(6)]],
const constant int& batch_ndim [[buffer(9)]],
const constant int* batch_shape [[buffer(10)]],
const constant size_t* vector_batch_stride [[buffer(11)]],
const constant size_t* matrix_batch_stride [[buffer(12)]],
const device out_mask_t* out_mask [[buffer(20)]],
const device op_mask_t* mat_mask [[buffer(21)]],
const device op_mask_t* vec_mask [[buffer(22)]],
const constant int* mask_strides [[buffer(23)]],
const constant size_t* mask_batch_strides [[buffer(24)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
using gemv_kernel =
GEMVKernel<T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN>;
threadgroup T tgp_memory
[gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];
constexpr bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
constexpr bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;
// Update batch offsets
if (kDoNCBatch) {
in_vec += elem_to_loc(tid.z, batch_shape, vector_batch_stride, batch_ndim);
mat += elem_to_loc(tid.z, batch_shape, matrix_batch_stride, batch_ndim);
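// Added note: elem_to_loc (from kernels/utils.h) converts the flat batch
// index tid.z into a memory offset by walking batch_shape with the given
// strides, which handles non-contiguous batch layouts.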
if (has_output_mask) {
out_mask +=
elem_to_loc(tid.z, batch_shape, mask_batch_strides, batch_ndim);
mask_batch_strides += batch_ndim;
}
if (has_operand_mask) {
const constant size_t* mask_strides_mat = mask_batch_strides;
const constant size_t* mask_strides_vec = mask_strides_mat + batch_ndim;
ulong2 batch_offsets = elem_to_loc_broadcast(
tid.z, batch_shape, mask_strides_mat, mask_strides_vec, batch_ndim);
mat_mask += batch_offsets.x;
vec_mask += batch_offsets.y;
}
} else {
in_vec += tid.z * vector_batch_stride[0];
mat += tid.z * matrix_batch_stride[0];
if (has_output_mask) {
out_mask += tid.z * mask_batch_strides[0];
mask_batch_strides += batch_ndim;
}
if (has_operand_mask) {
mat_mask += tid.z * mask_batch_strides[0];
vec_mask += tid.z * mask_batch_strides[batch_ndim];
}
}
out_vec += tid.z * out_vec_size;
gemv_kernel::run(
mat,
in_vec,
out_vec,
in_vec_size,
out_vec_size,
marix_ld,
out_mask,
mat_mask,
vec_mask,
mask_strides,
gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
tid,
lid,
simd_gid,
simd_lid);
}
#define instantiate_gemv_helper( \
outm_n, outm_t, opm_n, opm_t, name, itype, bm, bn, sm, sn, tm, tn, nc) \
template [[host_name("gemv_outmask_" #outm_n "_opmask_" #opm_n "_" #name \
"_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn "_tm" #tm \
"_tn" #tn "_nc" #nc)]] [[kernel]] void \
gemv_masked<itype, outm_t, opm_t, bm, bn, sm, sn, tm, tn, nc>( \
const device itype* mat [[buffer(0)]], \
const device itype* in_vec [[buffer(1)]], \
device itype* out_vec [[buffer(3)]], \
const constant int& in_vec_size [[buffer(4)]], \
const constant int& out_vec_size [[buffer(5)]], \
const constant int& marix_ld [[buffer(6)]], \
const constant int& batch_ndim [[buffer(9)]], \
const constant int* batch_shape [[buffer(10)]], \
const constant size_t* vector_batch_stride [[buffer(11)]], \
const constant size_t* matrix_batch_stride [[buffer(12)]], \
const device outm_t* out_mask [[buffer(20)]], \
const device opm_t* mat_mask [[buffer(21)]], \
const device opm_t* vec_mask [[buffer(22)]], \
const constant int* mask_strides [[buffer(23)]], \
const constant size_t* mask_batch_strides [[buffer(24)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint simd_gid [[simdgroup_index_in_threadgroup]], \
uint simd_lid [[thread_index_in_simdgroup]]);
// clang-format off
#define instantiate_gemv_base(name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_helper(bool_, bool, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_helper(name, itype, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_helper(bool_, bool, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_helper(name, itype, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_helper(nomask, nomask_t, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_helper(nomask, nomask_t, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_helper(bool_, bool, nomask, nomask_t, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_helper(name, itype, nomask, nomask_t, name, itype, bm, bn, sm, sn, tm, tn, nc) // clang-format on
// clang-format off
#define instantiate_gemv(name, itype, bm, bn, sm, sn, tm, tn) \
instantiate_gemv_base(name, itype, bm, bn, sm, sn, tm, tn, 0) \
instantiate_gemv_base(name, itype, bm, bn, sm, sn, tm, tn, 1) // clang-format on
// clang-format off
#define instantiate_gemv_blocks(name, itype) \
instantiate_gemv(name, itype, 2, 1, 4, 8, 1, 4) \
instantiate_gemv(name, itype, 2, 1, 4, 8, 4, 4) \
instantiate_gemv(name, itype, 2, 1, 2, 16, 1, 4) \
instantiate_gemv(name, itype, 2, 1, 2, 16, 4, 4) \
instantiate_gemv(name, itype, 4, 1, 2, 16, 4, 4) // clang-format on
instantiate_gemv_blocks(float32, float);
instantiate_gemv_blocks(float16, half);
instantiate_gemv_blocks(bfloat16, bfloat16_t);
///////////////////////////////////////////////////////////////////////////////
/// Vector matrix multiplication
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
typename out_mask_t,
typename op_mask_t,
const int BM, /* Threadgroup rows (in simdgroups) */
const int BN, /* Threadgroup cols (in simdgroups) */
const int SM, /* Simdgroup rows (in threads) */
const int SN, /* Simdgroup cols (in threads) */
const int TM, /* Thread rows (in elements) */
const int TN, /* Thread cols (in elements) */
const bool kDoNCBatch> /* Batch ndim > 1 */
[[kernel, max_total_threads_per_threadgroup(BM* BN * 32)]] void gemv_t_masked(
const device T* mat [[buffer(0)]],
const device T* in_vec [[buffer(1)]],
device T* out_vec [[buffer(3)]],
const constant int& in_vec_size [[buffer(4)]],
const constant int& out_vec_size [[buffer(5)]],
const constant int& marix_ld [[buffer(6)]],
const constant int& batch_ndim [[buffer(9)]],
const constant int* batch_shape [[buffer(10)]],
const constant size_t* vector_batch_stride [[buffer(11)]],
const constant size_t* matrix_batch_stride [[buffer(12)]],
const device out_mask_t* out_mask [[buffer(20)]],
const device op_mask_t* mat_mask [[buffer(21)]],
const device op_mask_t* vec_mask [[buffer(22)]],
const constant int* mask_strides [[buffer(23)]],
const constant size_t* mask_batch_strides [[buffer(24)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]) {
using gemv_kernel =
GEMVTKernel<T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN>;
threadgroup T tgp_memory
[gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];
constexpr bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
constexpr bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;
// Update batch offsets
if (kDoNCBatch) {
in_vec += elem_to_loc(tid.z, batch_shape, vector_batch_stride, batch_ndim);
mat += elem_to_loc(tid.z, batch_shape, matrix_batch_stride, batch_ndim);
if (has_output_mask) {
out_mask +=
elem_to_loc(tid.z, batch_shape, mask_batch_strides, batch_ndim);
mask_batch_strides += batch_ndim;
}
if (has_operand_mask) {
const constant size_t* mask_strides_mat = mask_batch_strides;
const constant size_t* mask_strides_vec = mask_strides_mat + batch_ndim;
ulong2 batch_offsets = elem_to_loc_broadcast(
tid.z, batch_shape, mask_strides_mat, mask_strides_vec, batch_ndim);
mat_mask += batch_offsets.x;
vec_mask += batch_offsets.y;
}
} else {
in_vec += tid.z * vector_batch_stride[0];
mat += tid.z * matrix_batch_stride[0];
if (has_output_mask) {
out_mask += tid.z * mask_batch_strides[0];
mask_batch_strides += batch_ndim;
}
if (has_operand_mask) {
mat_mask += tid.z * mask_batch_strides[0];
vec_mask += tid.z * mask_batch_strides[batch_ndim];
}
}
out_vec += tid.z * out_vec_size;
gemv_kernel::run(
mat,
in_vec,
out_vec,
in_vec_size,
out_vec_size,
marix_ld,
out_mask,
mat_mask,
vec_mask,
mask_strides,
gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
tid,
lid,
simd_gid,
simd_lid);
}
#define instantiate_gemv_t_helper( \
outm_n, outm_t, opm_n, opm_t, name, itype, bm, bn, sm, sn, tm, tn, nc) \
template [[host_name("gemv_t_outmask_" #outm_n "_opmask_" #opm_n "_" #name \
"_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn "_tm" #tm \
"_tn" #tn "_nc" #nc)]] [[kernel]] void \
gemv_t_masked<itype, outm_t, opm_t, bm, bn, sm, sn, tm, tn, nc>( \
const device itype* mat [[buffer(0)]], \
const device itype* in_vec [[buffer(1)]], \
device itype* out_vec [[buffer(3)]], \
const constant int& in_vec_size [[buffer(4)]], \
const constant int& out_vec_size [[buffer(5)]], \
const constant int& marix_ld [[buffer(6)]], \
const constant int& batch_ndim [[buffer(9)]], \
const constant int* batch_shape [[buffer(10)]], \
const constant size_t* vector_batch_stride [[buffer(11)]], \
const constant size_t* matrix_batch_stride [[buffer(12)]], \
const device outm_t* out_mask [[buffer(20)]], \
const device opm_t* mat_mask [[buffer(21)]], \
const device opm_t* vec_mask [[buffer(22)]], \
const constant int* mask_strides [[buffer(23)]], \
const constant size_t* mask_batch_strides [[buffer(24)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint simd_gid [[simdgroup_index_in_threadgroup]], \
uint simd_lid [[thread_index_in_simdgroup]]);
// clang-format off
#define instantiate_gemv_t_base(name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_t_helper(bool_, bool, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_t_helper(name, itype, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_t_helper(bool_, bool, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_t_helper(name, itype, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_t_helper(nomask, nomask_t, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_t_helper(nomask, nomask_t, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_t_helper(bool_, bool, nomask, nomask_t, name, itype, bm, bn, sm, sn, tm, tn, nc) \
instantiate_gemv_t_helper(name, itype, nomask, nomask_t, name, itype, bm, bn, sm, sn, tm, tn, nc) // clang-format on
// clang-format off
#define instantiate_gemv_t(name, itype, bm, bn, sm, sn, tm, tn) \
instantiate_gemv_t_base(name, itype, bm, bn, sm, sn, tm, tn, 0) \
instantiate_gemv_t_base(name, itype, bm, bn, sm, sn, tm, tn, 1) // clang-format on
// clang-format off
#define instantiate_gemv_t_blocks(name, itype) \
instantiate_gemv_t(name, itype, 1, 1, 8, 4, 4, 1) \
instantiate_gemv_t(name, itype, 1, 2, 8, 4, 4, 4) \
instantiate_gemv_t(name, itype, 1, 1, 8, 4, 8, 1) \
instantiate_gemv_t(name, itype, 1, 1, 8, 4, 8, 4) \
instantiate_gemv_t(name, itype, 1, 2, 8, 4, 8, 4) \
instantiate_gemv_t(name, itype, 1, 4, 8, 4, 8, 4) // clang-format on
// clang-format off
instantiate_gemv_t_blocks(float32, float);
instantiate_gemv_t_blocks(float16, half);
instantiate_gemv_t_blocks(bfloat16, bfloat16_t); // clang-format on


@@ -0,0 +1,167 @@
// Copyright © 2024 Apple Inc.
#include <metal_common>
#include <metal_compute>
#include "mlx/backend/metal/kernels/steel/defines.h"
using namespace metal;
// Thread local Hadamard transform for 2^R
template <short R>
METAL_FUNC void radix_func(thread float* x) {
constexpr short logR = __builtin_ctz(R);
short h = 1;
STEEL_PRAGMA_UNROLL
for (short s = 0; s < logR; s++) {
STEEL_PRAGMA_UNROLL
for (short i = 0; i < R / 2; i++) {
short k = i & (h - 1);
short j = ((i - k) << 1) + k;
float a = x[j];
float b = x[j + h];
x[j] = a + b;
x[j + h] = a - b;
}
h <<= 1;
}
}
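// Added example (not in the original source): radix_func<2> is a single
// butterfly, {x0, x1} -> {x0 + x1, x0 - x1}; radix_func<4> applies two such
// passes, which is equivalent to multiplying by the 4x4 Hadamard matrix.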
template <typename T, int N, int max_radix, int read_width>
[[kernel]] void hadamard_n(
const device T* in [[buffer(0)]],
device T* out [[buffer(1)]],
constant const float& scale,
uint3 elem [[thread_position_in_grid]],
uint3 grid [[threads_per_grid]]) {
// Compute a Hadamard transform of size N = 2^k
//
// Equivalent to:
// from scipy.linalg import hadamard
// y = hadamard(len(x)) @ x
constexpr short num_threads = N / max_radix;
constexpr short logN = __builtin_ctz(N);
constexpr short logR = __builtin_ctz(max_radix);
constexpr short num_steps = logN / logR;
constexpr short logFinal = logN % logR;
constexpr short final_radix = 1 << (logFinal);
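// Worked example (added for illustration), matching the N = 1024,
// max_radix = 16 case mentioned below:
//   num_threads = 64, logN = 10, logR = 4, num_steps = 2,
//   logFinal = 10 % 4 = 2, final_radix = 4
// i.e. two radix-16 stages followed by one radix-4 stage.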
int batch_idx = elem.x * N;
short i = elem.y;
threadgroup T buf[N];
// Read values from device
STEEL_PRAGMA_UNROLL
for (short j = 0; j < max_radix / read_width; j++) {
short index = j * read_width * num_threads + i * read_width;
STEEL_PRAGMA_UNROLL
for (short r = 0; r < read_width; r++) {
buf[index + r] = in[batch_idx + index + r];
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
float x[max_radix];
short h = 1;
STEEL_PRAGMA_UNROLL
for (short s = 0; s < num_steps; s++) {
short k = i & (h - 1);
short j = ((i - k) << logR) + k;
STEEL_PRAGMA_UNROLL
for (short r = 0; r < max_radix; r++) {
x[r] = buf[j + h * r];
}
radix_func<max_radix>(x);
STEEL_PRAGMA_UNROLL
for (short r = 0; r < max_radix; r++) {
buf[j + h * r] = x[r];
}
h <<= logR;
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// Do the final radix
// e.g. max_radix = 16
// N = 1024 = 16 * 16 * 4
if (final_radix > 1) {
// Each thread does multiple butterflies
STEEL_PRAGMA_UNROLL
for (int t = 0; t < max_radix / final_radix; t++) {
short index = i + t * num_threads;
short k = index & (h - 1);
short j = ((index - k) << logFinal) + k;
STEEL_PRAGMA_UNROLL
for (short r = 0; r < final_radix; r++) {
x[r] = buf[j + h * r];
}
radix_func<final_radix>(x);
STEEL_PRAGMA_UNROLL
for (short r = 0; r < final_radix; r++) {
buf[j + h * r] = x[r];
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// Write values to device
STEEL_PRAGMA_UNROLL
for (short j = 0; j < max_radix / read_width; j++) {
short index = j * read_width * num_threads + i * read_width;
STEEL_PRAGMA_UNROLL
for (short r = 0; r < read_width; r++) {
out[batch_idx + index + r] = buf[index + r] * scale;
}
}
}
template <typename T, int N, int M, int read_width>
[[kernel]] void hadamard_m(
const device T* in [[buffer(0)]],
device T* out [[buffer(1)]],
constant const float& scale,
uint3 elem [[thread_position_in_grid]],
uint3 grid [[threads_per_grid]]) {
// Compute a Hadamard transform of size M
// using a naive O(M^2) codelet.
//
// This kernel is the second stage in the computation
// of a Hadamard transform of size M*N where N = 2^k.
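// Added note: this assumes the size-(M*N) transform factors as the Kronecker
// product of an M x M Hadamard matrix with the 2^k Sylvester one. Stage one
// (hadamard_n) applies H_N along the contiguous axis; this stage applies H_M
// across elements spaced N apart (see the c * N + i * read_width indexing).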
int index = elem.x * grid.y + elem.y;
short i = index % (N / read_width);
int batch_idx = index / (N / read_width) * M * N;
float x[read_width][M];
STEEL_PRAGMA_UNROLL
for (short c = 0; c < M; c++) {
STEEL_PRAGMA_UNROLL
for (short r = 0; r < read_width; r++) {
x[r][c] = in[batch_idx + c * N + i * read_width + r];
}
}
STEEL_PRAGMA_UNROLL
for (short r = 0; r < read_width; r++) {
// This function is JIT compiled for M
// using the Hadamard matrix strings in `metal/hadamard.cpp`
hadamard_radix_m(x[r]);
}
// Write back to device
STEEL_PRAGMA_UNROLL
for (short c = 0; c < M; c++) {
STEEL_PRAGMA_UNROLL
for (short r = 0; r < read_width; r++) {
out[batch_idx + c * N + i * read_width + r] = x[r][c] * scale;
}
}
}


@@ -23,7 +23,7 @@ template <typename U = bool>
struct And {
bool simd_reduce(bool val) {
return simd_all(val);
};
}
static constexpr constant bool init = true;
@@ -61,7 +61,7 @@ template <typename U = bool>
struct Or {
bool simd_reduce(bool val) {
return simd_any(val);
};
}
static constexpr constant bool init = false;
@@ -100,7 +100,7 @@ struct Sum {
template <typename T>
T simd_reduce(T val) {
return simd_sum(val);
};
}
static constexpr constant U init = U(0);
@@ -120,7 +120,7 @@ struct Prod {
template <typename T>
T simd_reduce(T val) {
return simd_product(val);
};
}
static constexpr constant U init = U(1);
@@ -140,7 +140,7 @@ struct Min {
template <typename T>
T simd_reduce(T val) {
return simd_min(val);
};
}
static constexpr constant U init = Limits<U>::max;
@@ -160,7 +160,7 @@ struct Max {
template <typename T>
T simd_reduce(T val) {
return simd_max(val);
};
}
static constexpr constant U init = Limits<U>::min;


@@ -1,9 +1,927 @@
#include <metal_simdgroup>
#include <metal_stdlib>
#include "mlx/backend/metal/kernels/steel/defines.h"
#include "mlx/backend/metal/kernels/steel/gemm/transforms.h"
#include "mlx/backend/metal/kernels/steel/utils.h"
#include "mlx/backend/metal/kernels/scaled_dot_product_attention_params.h"
using namespace metal;
using namespace mlx::steel;
template <
typename T,
short BROWS,
short BCOLS,
short dst_ld,
short reduction_dim,
short tgp_size,
short alignment = 1,
short n_reads = (BCOLS * BROWS) / (tgp_size),
short TCOLS = BCOLS / n_reads,
short TROWS = tgp_size / TCOLS>
struct BlockLoaderFA {
STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
STEEL_CONST short vec_size = n_reads;
// Leading dimension for src
const int src_ld;
const int tile_stride;
// Thread location indices
const short thread_idx;
const short bi;
const short bj;
// threadgroup and device memory
threadgroup T* dst;
const device T* src;
struct alignas(alignment * sizeof(T)) ReadVector {
uint8_t v[sizeof(T) * vec_size];
};
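// Added note: ReadVector packs vec_size elements of T into one POD whose
// alignment is alignment * sizeof(T), so the copy in load_unsafe below can be
// lowered to a single wide load/store per row chunk.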
/* Constructor */
METAL_FUNC BlockLoaderFA(
const device T* src_,
const int src_ld_,
threadgroup T* dst_,
ushort simd_group_id [[simdgroup_index_in_threadgroup]],
ushort simd_lane_id [[thread_index_in_simdgroup]])
: src_ld(src_ld_),
tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
thread_idx(simd_group_id * 32 + simd_lane_id),
bi(thread_idx / TCOLS),
bj(vec_size * (thread_idx % TCOLS)),
dst(dst_ + bi * dst_ld + bj),
src(src_ + bi * src_ld + bj) {}
/* Load from device memory into threadgroup memory - without bound checking */
METAL_FUNC void load_unsafe() const {
STEEL_PRAGMA_UNROLL
for (short i = 0; i < BROWS; i += TROWS) {
*((threadgroup ReadVector*)(&dst[i * dst_ld])) =
*((const device ReadVector*)(&src[i * src_ld]));
}
}
/* Load from device memory into threadgroup memory - with bound checking */
METAL_FUNC void load_safe(short2 src_tile_dim) const {
src_tile_dim = src_tile_dim - short2(bj, bi);
// Skip loading if thread has no valid reads
if (src_tile_dim.x <= 0 || src_tile_dim.y <= 0) {
STEEL_PRAGMA_UNROLL
for (short i = 0; i < BROWS; i += TROWS) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = T(0);
}
}
return;
}
// Use fast thread memory for bound checks
bool tmp_idx[vec_size];
T tmp_val[vec_size];
STEEL_PRAGMA_UNROLL
for (short i = 0; i < BROWS; i += TROWS) {
// Make sure tmp_idx only contains valid indices
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
tmp_idx[j] = (i < src_tile_dim.y) && (j < src_tile_dim.x);
}
// Read valid indices into tmp_val
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = src[(tmp_idx[j] ? i * src_ld + j : 0)];
}
// Zero out unneeded values
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);
}
// Copy values to threadgroup memory
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = tmp_val[j];
}
}
}
/* Iteration helper */
METAL_FUNC void next() {
src += tile_stride;
}
METAL_FUNC void next(short n) {
src += n * tile_stride;
}
};
template <bool M_aligned, bool N_aligned, bool K_aligned>
struct LoopAlignment {};
template <
typename T,
typename U,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_a,
bool transpose_b,
short lda_tgp,
short ldb_tgp,
typename AccumType = float,
typename Epilogue = TransformNone<U, AccumType>>
struct BlockMMAFA {
// Warp tile simdgroup matrix strides along M
STEEL_CONST short TM_stride = 8 * WM;
// Warp tile simdgroup matrix strides along N
STEEL_CONST short TN_stride = 8 * WN;
// Warp tile size along M
STEEL_CONST short TM = BM / TM_stride;
// Warp tile size along N
STEEL_CONST short TN = BN / TN_stride;
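// Illustrative sizes (added, hypothetical values): with BM = BN = 32 and
// WM = WN = 2, TM_stride = TN_stride = 16 and TM = TN = 2, i.e. each of the
// four simdgroups accumulates a 2 x 2 grid of 8x8 simdgroup_matrix tiles
// covering a 16 x 16 sub-block of the 32 x 32 threadgroup tile.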
// Strides of A, B along reduction axis
STEEL_CONST short simd_stride_a = {
transpose_a ? TM_stride : TM_stride * lda_tgp};
STEEL_CONST short simd_stride_b = {
transpose_b ? TN_stride * ldb_tgp : TN_stride};
// Jump between elements
STEEL_CONST short jump_a = {transpose_a ? lda_tgp : 1};
STEEL_CONST short jump_b = {transpose_b ? ldb_tgp : 1};
STEEL_CONST short tile_stride_a = {transpose_a ? 8 * lda_tgp : 8};
STEEL_CONST short tile_stride_b = {transpose_b ? 8 : 8 * ldb_tgp};
// Simdgroup matrices
simdgroup_matrix<AccumType, 8, 8> Asimd[TM];
simdgroup_matrix<AccumType, 8, 8> Bsimd[TN];
simdgroup_matrix<AccumType, 8, 8> results[TM * TN] = {
simdgroup_matrix<AccumType, 8, 8>(0)};
// Offsets within threadgroup
const short tm;
const short tn;
short sm;
short sn;
ushort sid;
ushort slid;
short As_offset;
short Bs_offset;
/* Constructor */
METAL_FUNC BlockMMAFA(
ushort simd_group_id [[simdgroup_index_in_threadgroup]],
ushort simd_lane_id [[thread_index_in_simdgroup]])
: tm(8 * (simd_group_id / WN)), tn(8 * (simd_group_id % WN)) {
// Determine thread position in simdgroup matrix
short qid = simd_lane_id / 4;
slid = simd_lane_id;
sid = simd_group_id;
sm = (qid & 4) + (simd_lane_id / 2) % 4;
sn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;
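// Added note: sm/sn locate the pair of adjacent elements this lane owns
// inside each 8x8 simdgroup_matrix tile, matching the two entries of
// thread_elements() read and written in mma() and the store_* methods.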
// Determine thread and simdgroup offset
As_offset =
transpose_a ? ((sn)*lda_tgp + (tm + sm)) : ((sn) + (tm + sm) * lda_tgp);
Bs_offset =
transpose_b ? ((tn + sn) * ldb_tgp + (sm)) : ((sm)*ldb_tgp + (tn + sn));
}
/* (BM, BK) X (BK, BN) multiply accumulate function */
METAL_FUNC void mma(const threadgroup T* As, const threadgroup T* Bs) {
// Adjust for simdgroup and thread location
As += As_offset;
Bs += Bs_offset;
// Iterate over BK in blocks of 8
STEEL_PRAGMA_UNROLL
for (short kk = 0; kk < BK; kk += 8) {
simdgroup_barrier(mem_flags::mem_none);
// Load elements from threadgroup A as simdgroup matrices
STEEL_PRAGMA_UNROLL
for (short i = 0; i < TM; i++) {
Asimd[i].thread_elements()[0] =
static_cast<AccumType>(As[i * simd_stride_a + 0]);
Asimd[i].thread_elements()[1] =
static_cast<AccumType>(As[i * simd_stride_a + jump_a]);
}
simdgroup_barrier(mem_flags::mem_none);
// Load elements from threadgroup B as simdgroup matrices
STEEL_PRAGMA_UNROLL
for (short j = 0; j < TN; j++) {
Bsimd[j].thread_elements()[0] =
static_cast<AccumType>(Bs[j * simd_stride_b + 0]);
Bsimd[j].thread_elements()[1] =
static_cast<AccumType>(Bs[j * simd_stride_b + jump_b]);
}
simdgroup_barrier(mem_flags::mem_none);
// Multiply and accumulate into result simdgroup matrices
STEEL_PRAGMA_UNROLL
for (short i = 0; i < TM; i++) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < TN; j++) {
short j_serp = (i % 2) ? (TN - 1 - j) : j;
simdgroup_multiply_accumulate(
results[i * TN + j_serp],
Asimd[i],
Bsimd[j_serp],
results[i * TN + j_serp]);
}
}
// Progress to next simdgroup tile
As += tile_stride_a;
Bs += tile_stride_b;
}
}
METAL_FUNC void rescale_output(const threadgroup float* Corrections) {
// Loop over all simdgroup tiles
STEEL_PRAGMA_UNROLL
for (short i = 0; i < TM; i++) {
short row = sm + tm + i * TM_stride;
float scale_value = Corrections[row];
STEEL_PRAGMA_UNROLL
for (short j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread auto& accum = results[i * TN + j].thread_elements();
// int offset = (i * TM_stride) * ldc + (j * TN_stride);
accum[0] *= scale_value;
accum[1] *= scale_value;
}
}
}
/* Store accumulated simdgroup_matrix results into device memory */
METAL_FUNC void store_result(device U* C, const int ldc) const {
// Adjust for simdgroup and thread location
C += (sm + tm) * ldc + tn + sn;
// Loop over all simdgroup tiles
STEEL_PRAGMA_UNROLL
for (short i = 0; i < TM; i++) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread const auto& accum = results[i * TN + j].thread_elements();
int offset = (i * TM_stride) * ldc + (j * TN_stride);
// Apply epilogue
U outs[2] = {Epilogue::apply(accum[0]), Epilogue::apply(accum[1])};
// Write out C
C[offset] = outs[0];
C[offset + 1] = outs[1];
}
}
}
METAL_FUNC void store_result_to_tgp_memory(
threadgroup U* C,
const int ldc,
short2 dst_tile_dims) const {
// Adjust for simdgroup and thread location
C += (sm + tm) * ldc + (tn + sn);
dst_tile_dims -= short2(tn + sn, sm + tm);
STEEL_PRAGMA_UNROLL
for (int i = 0; i < TM; i++) {
if (i * TM_stride < dst_tile_dims.y) {
STEEL_PRAGMA_UNROLL
for (int j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread const auto& accum = results[i * TN + j].thread_elements();
int offset = (i * TM_stride) * ldc + (j * TN_stride);
// Apply epilogue and output C
if (j * TN_stride < dst_tile_dims.x) {
C[offset] = Epilogue::apply(accum[0]);
}
if (j * TN_stride + 1 < dst_tile_dims.x) {
C[offset + 1] = Epilogue::apply(accum[1]);
}
}
}
}
}
METAL_FUNC void
store_result_safe(device U* C, const int ldc, short2 dst_tile_dims) const {
// Adjust for simdgroup and thread location
C += (sm + tm) * ldc + (tn + sn);
dst_tile_dims -= short2(tn + sn, sm + tm);
STEEL_PRAGMA_UNROLL
for (int i = 0; i < TM; i++) {
if (i * TM_stride < dst_tile_dims.y) {
STEEL_PRAGMA_UNROLL
for (int j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread const auto& accum = results[i * TN + j].thread_elements();
int offset = (i * TM_stride) * ldc + (j * TN_stride);
// Apply epilogue and output C
if (j * TN_stride < dst_tile_dims.x) {
C[offset] = Epilogue::apply(accum[0]);
}
if (j * TN_stride + 1 < dst_tile_dims.x) {
C[offset + 1] = Epilogue::apply(accum[1]);
}
}
}
}
}
/* Store accumulated simdgroup_matrix results into device memory */
METAL_FUNC void store_result(
device U* D,
const int ldd,
const device U* C,
const int ldc,
const int fdc,
thread const Epilogue& epilogue_op) const {
// Adjust for simdgroup and thread location
C += (sm + tm) * ldc + (tn + sn) * fdc;
D += (sm + tm) * ldd + tn + sn;
// Loop over all simdgroup tiles
STEEL_PRAGMA_UNROLL
for (short i = 0; i < TM; i++) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread const auto& accum = results[i * TN + j].thread_elements();
int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
int offset_d = (i * TM_stride) * ldd + (j * TN_stride);
// Apply epilogue
U outs[2] = {
epilogue_op.apply(accum[0], C[offset_c]),
epilogue_op.apply(accum[1], C[offset_c + fdc])};
// Write out D
D[offset_d] = outs[0];
D[offset_d + 1] = outs[1];
}
}
}
METAL_FUNC void store_result_safe(
device U* D,
const int ldd,
const device U* C,
const int ldc,
const int fdc,
short2 dst_tile_dims,
thread const Epilogue& epilogue_op) const {
// Adjust for simdgroup and thread location
C += (sm + tm) * ldc + (tn + sn) * fdc;
D += (sm + tm) * ldd + tn + sn;
dst_tile_dims -= short2(tn + sn, sm + tm);
STEEL_PRAGMA_UNROLL
for (int i = 0; i < TM; i++) {
if (i * TM_stride < dst_tile_dims.y) {
STEEL_PRAGMA_UNROLL
for (int j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread const auto& accum = results[i * TN + j].thread_elements();
int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
int offset_d = (i * TM_stride) * ldd + (j * TN_stride);
// Apply epilogue and output D
if (j * TN_stride < dst_tile_dims.x) {
D[offset_d] = epilogue_op.apply(accum[0], C[offset_c]);
}
if (j * TN_stride + 1 < dst_tile_dims.x) {
D[offset_d + 1] = epilogue_op.apply(accum[1], C[offset_c + fdc]);
}
}
}
}
}
METAL_FUNC void clear_results() {
STEEL_PRAGMA_UNROLL
for (int i = 0; i < TM; i++) {
STEEL_PRAGMA_UNROLL
for (int j = 0; j < TN; j++) {
results[i * TN + j] = simdgroup_matrix<AccumType, 8, 8>(0);
}
}
}
};
template <
typename T,
typename U,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_q,
bool transpose_k,
bool transpose_v,
bool MN_aligned,
bool K_aligned,
typename AccumType = typename AccumHelper<T>::accum_type,
typename Epilogue = TransformNone<U, AccumType>>
struct FastAttentionKernel {
STEEL_CONST short tgp_padding = 16 / sizeof(T);
STEEL_CONST short float_padding = 16 / sizeof(float);
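// Threadgroup tile sizes below pad each row by 16 bytes; the padding
// presumably exists to reduce shared-memory bank conflicts.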
STEEL_CONST short tgp_mem_size_q =
transpose_q ? BK * (BM + tgp_padding) : BM * (BK + tgp_padding);
STEEL_CONST short tgp_mem_size_k =
transpose_k ? BK * (BN + tgp_padding) : BN * (BK + tgp_padding);
STEEL_CONST short tgp_mem_size_v =
transpose_v ? BK * (BN + tgp_padding) : BN * (BK + tgp_padding);
STEEL_CONST short tgp_mem_size_s = BM * (BN + tgp_padding);
// maxes, rowsums, rescale
STEEL_CONST short tgp_mem_size_corrections =
4 * (BM * sizeof(float) + float_padding);
STEEL_CONST bool share_kv_smem = transpose_k != transpose_v;
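// K and V tiles can share one threadgroup buffer because they are consumed in
// different phases of each block iteration (K for Q*K^T, then V for P*V).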
STEEL_CONST short tgp_mem_size = share_kv_smem
? tgp_mem_size_q + tgp_mem_size_k + tgp_mem_size_s +
tgp_mem_size_corrections
: tgp_mem_size_q + tgp_mem_size_k + tgp_mem_size_s +
tgp_mem_size_corrections + tgp_mem_size_v;
STEEL_CONST short tgp_size = WM * WN * 32;
static_assert(transpose_q == false, "Expected Q not transposed.");
static_assert(transpose_k == true, "Expected K transposed.");
static_assert(transpose_v == false, "Expected V not transposed.");
static_assert(tgp_mem_size <= 32768, "Excessive tgp memory requested.");
using loader_q_t = BlockLoaderFA<
T,
transpose_q ? BK : BM,
transpose_q ? BM : BK,
transpose_q ? BM + tgp_padding : BK + tgp_padding,
!transpose_q,
tgp_size>;
using loader_k_t = BlockLoaderFA<
T,
transpose_k ? BN : BK,
transpose_k ? BK : BN,
transpose_k ? BK + tgp_padding : BN + tgp_padding,
transpose_k,
tgp_size>;
using loader_v_t = BlockLoaderFA<
T,
transpose_v ? BK : BN,
transpose_v ? BN : BK,
transpose_v ? BN + tgp_padding : BK + tgp_padding,
transpose_v,
tgp_size>;
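// Two MMA configurations: mma_qk_t forms the (BM x BN) score tile S = Q*K^T,
// and mma_sv_t accumulates the (BM x BK) output tile O += P*V, where P is the
// rescaled softmax block held in threadgroup memory.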
using mma_qk_t = BlockMMAFA<
T,
U,
BM,
BN,
BK,
WM,
WN,
transpose_q,
transpose_k,
transpose_q ? BM + tgp_padding : BK + tgp_padding,
transpose_k ? BK + tgp_padding : BN + tgp_padding,
AccumType,
Epilogue>;
using mma_sv_t = BlockMMAFA<
T,
U,
BM,
BK,
BN,
WM,
WN,
false,
transpose_v,
BN + tgp_padding,
BK + tgp_padding,
AccumType,
Epilogue>;
/* Inner gemm loop over the reduction (K) dimension */
template <bool M_aligned, bool N_aligned, bool K_aligned_>
static METAL_FUNC void gemm_loop(
threadgroup T* As [[threadgroup(0)]],
threadgroup T* Bs [[threadgroup(1)]],
const int gemm_k_iterations,
thread loader_k_t& loader_b,
thread mma_qk_t& mma_op,
thread const short& tgp_bm,
thread const short& tgp_bn,
LoopAlignment<M_aligned, N_aligned, K_aligned_> l = {}) {
// Appease the compiler
(void)l;
(void)tgp_bm;
short2 tile_dims_B = transpose_k ? short2(BK, tgp_bn) : short2(tgp_bn, BK);
// not valid for gemm_k_iterations > 1 (so, BK == d_k)
for (int k = 0; k < gemm_k_iterations; k++) {
threadgroup_barrier(mem_flags::mem_threadgroup);
if (N_aligned) {
loader_b.load_unsafe();
} else {
loader_b.load_safe(tile_dims_B);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
}
}
static METAL_FUNC void initialize_corrections(
threadgroup float* C,
uint simd_lane_id,
uint simd_group_id) {
if (simd_group_id == 0) {
threadgroup float* maxes = C;
threadgroup float* sums = C + (BM + float_padding);
threadgroup float* o_rescale = sums + (BM + float_padding);
threadgroup float* output_rescale = o_rescale + (BM + float_padding);
if (simd_lane_id < BM) {
maxes[simd_lane_id] = -INFINITY; // m_i
sums[simd_lane_id] = 0.f; // l_i
o_rescale[simd_lane_id] = 1.f; // li * exp(mi - mi_new)
output_rescale[simd_lane_id] = 1.f; // 1.0 / l_i
}
}
}
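// rescale_ss performs one online-softmax step per row i of the score tile
// (scores are scaled by alpha before the exponentials):
//   m_ij    = max_j alpha * S[i][j]                (max over this block)
//   m_new   = max(m_old, m_ij)
//   P[i][j] = exp(alpha * S[i][j] - m_new)         (written back into Ss)
//   l_new   = exp(m_old - m_new) * l_old + sum_j P[i][j]
// and records the correction factors applied to the running output:
//   o_rescale      = l_old * exp(m_old - m_new)    (undo previous 1/l_old, shift max)
//   output_rescale = 1 / l_new                     (applied after accumulating P*V)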
static METAL_FUNC void rescale_ss(
threadgroup T* Ss,
threadgroup float* Corrections,
uint simd_group_id,
uint simd_lane_id,
short2 local_blocks,
float alpha) {
if (simd_group_id == 0) {
short row_offset = BM + float_padding;
threadgroup float* maxes = Corrections;
threadgroup float* sums = Corrections + row_offset;
threadgroup float* o_rescale = sums + row_offset;
threadgroup float* output_scales = o_rescale + row_offset;
if (simd_lane_id < uint(local_blocks.y)) {
float m_i_old = maxes[simd_lane_id];
float l_i_old = sums[simd_lane_id];
float m_i_new = m_i_old;
float l_i_new = l_i_old;
short offset = simd_lane_id * (BN + tgp_padding);
float m_ij = -INFINITY;
for (short j = 0; j < local_blocks.x; j++) {
float val = alpha * float(Ss[offset + j]);
m_ij = max(m_ij, val);
}
m_i_new = max(m_ij, m_i_new);
float rowsum = 0.f; // lij
for (short j = 0; j < local_blocks.x; j++) {
float val = alpha * float(Ss[offset + j]);
float P_i_j = exp(val - m_ij);
rowsum += P_i_j;
P_i_j = P_i_j * exp(m_ij - m_i_new);
Ss[offset + j] = T(P_i_j);
}
l_i_new =
exp(m_i_old - m_i_new) * l_i_old + exp(m_ij - m_i_new) * rowsum;
maxes[simd_lane_id] = m_i_new;
sums[simd_lane_id] = l_i_new;
float rescale = l_i_old * exp(m_i_old - m_i_new);
o_rescale[simd_lane_id] = rescale;
output_scales[simd_lane_id] = 1.0 / l_i_new;
}
}
}
/* Main kernel function */
static METAL_FUNC void run(
const device T* Q [[buffer(0)]],
const device T* K [[buffer(1)]],
const device T* V [[buffer(2)]],
device U* O [[buffer(3)]],
const constant MLXFastAttentionParams* params [[buffer(4)]],
threadgroup T* Qs [[threadgroup(0)]],
threadgroup T* Ks [[threadgroup(1)]],
threadgroup T* Ss [[threadgroup(2)]],
threadgroup T* Vs [[threadgroup(3)]],
threadgroup float* Corrections [[threadgroup(4)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
// Pacifying compiler
(void)lid;
const int tid_y = ((tid.y) << params->swizzle_log) +
((tid.x) & ((1 << params->swizzle_log) - 1));
const int tid_x = (tid.x) >> params->swizzle_log;
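// Swizzle the threadgroup grid: interleave the x/y tile indices so that
// threadgroups launched close together work on nearby tiles, a mapping
// commonly used to improve cache locality; swizzle_log == 0 is the identity.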
if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
return;
}
threadgroup_barrier(mem_flags::mem_none);
// Find block in Q, O; and head in K, V.
const int c_row = tid_y * BM;
Q += transpose_q ? c_row : c_row * params->ldq;
thread loader_q_t loader_q(Q, params->ldq, Qs, simd_group_id, simd_lane_id);
short tgp_bm = min(BM, params->M - c_row);
short2 tile_dims_Q = transpose_q ? short2(tgp_bm, BK) : short2(BK, tgp_bm);
loader_q.load_safe(tile_dims_Q);
initialize_corrections(Corrections, simd_lane_id, simd_group_id);
O += c_row * params->ldo;
// Prepare threadgroup mma operation
thread mma_qk_t mma_qk_op(simd_group_id, simd_lane_id);
thread mma_sv_t mma_softmax_sv_op(simd_group_id, simd_lane_id);
thread loader_k_t loader_k(K, params->ldk, Ks, simd_group_id, simd_lane_id);
thread loader_v_t loader_v(V, params->ldv, Vs, simd_group_id, simd_lane_id);
for (short n_block = 0; n_block < params->gemm_n_iterations_aligned;
n_block++) {
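// Each iteration over a key/value block:
//   1. accumulate the score tile S = Q * K^T (gemm_loop below)
//   2. spill S to threadgroup memory and apply the online-softmax rescale
//   3. load the V block, rescale the running output accumulator, accumulate
//      P * V, then renormalize by 1 / l_new
//   4. advance the K/V loaders and clear the score accumulators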
short c_col = BN;
// Prepare threadgroup loading operations
short gemm_k_iterations = params->gemm_k_iterations_aligned;
short tgp_bn_qk = min(BN, params->N - c_col * n_block);
threadgroup_barrier(mem_flags::mem_none);
///////////////////////////////////////////////////////////////////////////////
{ // Loop over K - unaligned case
if (tgp_bm == BM && tgp_bn_qk == BN) {
gemm_loop<true, true, K_aligned>(
Qs,
Ks,
gemm_k_iterations,
loader_k,
mma_qk_op,
tgp_bm,
tgp_bn_qk);
} else if (tgp_bn_qk == BN) {
gemm_loop<false, true, K_aligned>(
Qs,
Ks,
gemm_k_iterations,
loader_k,
mma_qk_op,
tgp_bm,
tgp_bn_qk);
} else if (tgp_bm == BM) {
gemm_loop<true, false, K_aligned>(
Qs,
Ks,
gemm_k_iterations,
loader_k,
mma_qk_op,
tgp_bm,
tgp_bn_qk);
} else {
gemm_loop<false, false, K_aligned>(
Qs,
Ks,
gemm_k_iterations,
loader_k,
mma_qk_op,
tgp_bm,
tgp_bn_qk);
}
}
mma_qk_op.store_result_to_tgp_memory(
Ss, BN + tgp_padding, short2(BN, BM));
threadgroup_barrier(mem_flags::mem_threadgroup);
rescale_ss(
Ss,
Corrections,
simd_group_id,
simd_lane_id,
short2(tgp_bn_qk, tgp_bm),
params->alpha);
loader_v.load_safe(short2(BK, tgp_bn_qk));
threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float* o_scales = Corrections + 2 * (BM + float_padding);
mma_softmax_sv_op.rescale_output(o_scales);
mma_softmax_sv_op.mma(Ss, Vs);
threadgroup float* final_output_scales =
Corrections + 3 * (BM + float_padding);
mma_softmax_sv_op.rescale_output(final_output_scales);
loader_v.next();
loader_k.next(BN);
mma_qk_op.clear_results();
}
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_softmax_sv_op.store_result_safe(O, params->ldo, short2(BK, tgp_bm));
}
};
template <
typename T,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_q,
bool transpose_k,
bool transpose_v,
bool MN_aligned,
bool K_aligned>
[[kernel, max_total_threads_per_threadgroup(WM* WN * 32)]] void attention(
const device T* Q [[buffer(0)]],
const device T* K [[buffer(1)]],
const device T* V [[buffer(2)]],
device T* O [[buffer(3)]],
const constant MLXFastAttentionParams* params [[buffer(4)]],
const constant int* batch_shape [[buffer(6)]],
const constant size_t* batch_strides [[buffer(7)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
using attention_kernel = FastAttentionKernel<
T,
T,
BM,
BN,
BK,
WM,
WN,
transpose_q,
transpose_k,
transpose_v,
MN_aligned,
K_aligned>;
// Adjust for batch
if (params->batch_ndim > 1) {
const constant size_t* Q_bstrides = batch_strides;
const constant size_t* KV_bstrides = batch_strides + params->batch_ndim;
ulong2 batch_offsets = elem_to_loc_broadcast(
tid.z, batch_shape, Q_bstrides, KV_bstrides, params->batch_ndim);
Q += batch_offsets.x;
K += batch_offsets.y;
V += batch_offsets.y;
} else {
Q += params->batch_stride_q * tid.z;
K += params->batch_stride_k * tid.z;
V += params->batch_stride_v * tid.z;
}
// same shape as input
O += params->batch_stride_o * tid.z;
threadgroup T Qs[attention_kernel::tgp_mem_size_q];
threadgroup T Ss[attention_kernel::tgp_mem_size_s];
threadgroup float Corrections[attention_kernel::tgp_mem_size_corrections];
if (attention_kernel::share_kv_smem) {
threadgroup T Ks[attention_kernel::tgp_mem_size_k];
threadgroup T* Vs = Ks; //[attention_kernel::tgp_mem_size_v];
attention_kernel::run(
Q,
K,
V,
O,
params,
Qs,
Ks,
Ss,
Vs,
Corrections,
simd_lane_id,
simd_group_id,
tid,
lid);
} else {
threadgroup T Ks[attention_kernel::tgp_mem_size_k];
threadgroup T Vs[attention_kernel::tgp_mem_size_v];
attention_kernel::run(
Q,
K,
V,
O,
params,
Qs,
Ks,
Ss,
Vs,
Corrections,
simd_lane_id,
simd_group_id,
tid,
lid);
}
}
#define instantiate_fast_inference_self_attention_kernel( \
itype, otype, bm, bn, bk, wm, wn) \
template [[host_name("steel_gemm_attention_bm_" #bm "_bn_" #bn "_bk_" #bk \
"_itype_" #itype)]] [[kernel]] void \
attention<itype, bm, bn, bk, wm, wn, false, true, false, false, true>( \
const device itype* Q [[buffer(0)]], \
const device itype* K [[buffer(1)]], \
const device itype* V [[buffer(2)]], \
device otype* O [[buffer(3)]], \
const constant MLXFastAttentionParams* params [[buffer(4)]], \
const constant int* batch_shape [[buffer(6)]], \
const constant size_t* batch_strides [[buffer(7)]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]]);
instantiate_fast_inference_self_attention_kernel(
float,
float,
16,
16,
64,
2,
2);
instantiate_fast_inference_self_attention_kernel(
float,
float,
16,
16,
128,
2,
2);
instantiate_fast_inference_self_attention_kernel(half, half, 16, 16, 64, 2, 2);
instantiate_fast_inference_self_attention_kernel(half, half, 16, 16, 128, 2, 2);
template <
typename T,
typename T2,


@@ -4,6 +4,34 @@
#pragma once
struct MLXFastAttentionParams {
const int M;
const int N;
const int K;
const int ldq; // ldq == ldo
const int ldk;
const int ldv;
const int lds;
const int ldo;
const int tiles_n;
const int tiles_m;
const int batch_stride_q;
const int batch_stride_k;
const int batch_stride_v;
const int batch_stride_o;
const int swizzle_log;
const int gemm_n_iterations_aligned;
const int gemm_k_iterations_aligned;
const int gemm_sv_m_block_iterations;
const int batch_ndim;
const float alpha;
};
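// Illustrative only (not part of the original source): for an unbatched,
// row-major self-attention with sequence length L and head dimension d_k, one
// plausible parameterization would be
//   M = N = L, K = d_k, ldq = ldk = ldv = ldo = d_k,
//   tiles_m = ceil(L / BM), tiles_n = ceil(L / BN),
//   gemm_k_iterations_aligned = 1 (the attention kernel assumes BK == d_k),
//   swizzle_log = 0, batch_ndim = 1,
//   alpha = 1 / sqrt(d_k) for standard scaled dot-product attention.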
struct MLXScaledDotProductAttentionParams {
// Associated dimensions & transposition information
const uint QUERY_SEQUENCE_LENGTH = 1;


@@ -309,6 +309,7 @@ template <
}
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Share the prefix
if (simd_group_id == simd_groups - 1 && simd_lane_id == simd_size - 1) {


@@ -10,7 +10,10 @@ METAL_FUNC void scatter_1d_index_impl(
device mlx_atomic<T>* out [[buffer(2)]],
const constant int* out_shape [[buffer(3)]],
const constant size_t* out_strides [[buffer(4)]],
const constant size_t& upd_size [[buffer(5)]],
const constant size_t& out_ndim [[buffer(5)]],
const constant int* upd_shape [[buffer(6)]],
const constant size_t& upd_ndim [[buffer(7)]],
const constant size_t& upd_size [[buffer(8)]],
const thread array<const device IdxT*, NIDX>& idx_buffers,
uint2 gid [[thread_position_in_grid]]) {
Op op;
@@ -21,7 +24,14 @@ METAL_FUNC void scatter_1d_index_impl(
out_idx += idx_val * out_strides[i];
}
op.atomic_update(out, updates[gid.y * upd_size + gid.x], out_idx + gid.x);
if (upd_ndim > 1) {
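// Multi-dimensional updates: gid.x indexes a whole inner slice, so map it
// through the update shape (skipping the leading axis) and the output strides
// to find the element offset instead of assuming a contiguous layout.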
auto out_offset = elem_to_loc(gid.x, upd_shape + 1, out_strides, out_ndim);
out_idx += out_offset;
} else {
out_idx += gid.x;
}
op.atomic_update(out, updates[gid.y * upd_size + gid.x], out_idx);
}
template <typename T, typename IdxT, typename Op, int NIDX>


@@ -235,19 +235,21 @@ struct KernelMergeSort {
const device T* inp,
device U* out,
const constant int& size_sorted_axis,
const constant int& stride_sorted_axis,
const constant int& stride_segment_axis,
const constant int& in_stride_sorted_axis,
const constant int& out_stride_sorted_axis,
const constant int& in_stride_segment_axis,
const constant int& out_stride_segment_axis,
threadgroup val_t* tgp_vals,
threadgroup idx_t* tgp_idxs,
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
// tid.y tells us the segment index
inp += tid.y * stride_segment_axis;
out += tid.y * stride_segment_axis;
inp += tid.y * in_stride_segment_axis;
out += tid.y * out_stride_segment_axis;
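// The input and output arrays may have different layouts, so the sorted axis
// and the segment axis each carry separate input and output strides.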
// Copy into threadgroup memory
for (short i = lid.x; i < N_PER_BLOCK; i += BLOCK_THREADS) {
tgp_vals[i] = i < size_sorted_axis ? inp[i * stride_sorted_axis]
tgp_vals[i] = i < size_sorted_axis ? inp[i * in_stride_sorted_axis]
: val_t(CompareOp::init);
if (ARG_SORT) {
tgp_idxs[i] = i;
@@ -264,9 +266,9 @@ struct KernelMergeSort {
// Write output
for (int i = lid.x; i < size_sorted_axis; i += BLOCK_THREADS) {
if (ARG_SORT) {
out[i * stride_sorted_axis] = tgp_idxs[i];
out[i * out_stride_sorted_axis] = tgp_idxs[i];
} else {
out[i * stride_sorted_axis] = tgp_vals[i];
out[i * out_stride_sorted_axis] = tgp_vals[i];
}
}
}
@@ -282,8 +284,10 @@ template <
const device T* inp [[buffer(0)]],
device U* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& stride_segment_axis [[buffer(4)]],
const constant int& in_stride_sorted_axis [[buffer(3)]],
const constant int& out_stride_sorted_axis [[buffer(4)]],
const constant int& in_stride_segment_axis [[buffer(5)]],
const constant int& out_stride_segment_axis [[buffer(6)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
using sort_kernel =
@@ -298,8 +302,10 @@ template <
inp,
out,
size_sorted_axis,
stride_sorted_axis,
stride_segment_axis,
in_stride_sorted_axis,
out_stride_sorted_axis,
in_stride_segment_axis,
out_stride_segment_axis,
tgp_vals,
tgp_idxs,
tid,
@@ -310,8 +316,10 @@ template <
inp,
out,
size_sorted_axis,
stride_sorted_axis,
stride_segment_axis,
in_stride_sorted_axis,
out_stride_sorted_axis,
in_stride_segment_axis,
out_stride_segment_axis,
tgp_vals,
nullptr,
tid,
@@ -331,10 +339,12 @@ template <
const device T* inp [[buffer(0)]],
device U* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& nc_dim [[buffer(4)]],
const device int* nc_shape [[buffer(5)]],
const device size_t* nc_strides [[buffer(6)]],
const constant int& in_stride_sorted_axis [[buffer(3)]],
const constant int& out_stride_sorted_axis [[buffer(4)]],
const constant int& nc_dim [[buffer(5)]],
const device int* nc_shape [[buffer(6)]],
const device size_t* in_nc_strides [[buffer(7)]],
const device size_t* out_nc_strides [[buffer(8)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
using sort_kernel =
@@ -342,9 +352,10 @@ template <
using val_t = typename sort_kernel::val_t;
using idx_t = typename sort_kernel::idx_t;
auto block_idx = elem_to_loc(tid.y, nc_shape, nc_strides, nc_dim);
inp += block_idx;
out += block_idx;
auto in_block_idx = elem_to_loc(tid.y, nc_shape, in_nc_strides, nc_dim);
auto out_block_idx = elem_to_loc(tid.y, nc_shape, out_nc_strides, nc_dim);
inp += in_block_idx;
out += out_block_idx;
if (ARG_SORT) {
threadgroup val_t tgp_vals[sort_kernel::N_PER_BLOCK];
@@ -353,7 +364,9 @@ template <
inp,
out,
size_sorted_axis,
stride_sorted_axis,
in_stride_sorted_axis,
out_stride_sorted_axis,
zero_helper,
zero_helper,
tgp_vals,
tgp_idxs,
@@ -365,7 +378,9 @@ template <
inp,
out,
size_sorted_axis,
stride_sorted_axis,
in_stride_sorted_axis,
out_stride_sorted_axis,
zero_helper,
zero_helper,
tgp_vals,
nullptr,


@@ -10,28 +10,10 @@
#define instantiate_block_sort( \
name, itname, itype, otname, otype, arg_sort, bn, tn) \
template [[host_name("c" #name "_" #itname "_" #otname "_bn" #bn \
"_tn" #tn)]] [[kernel]] void \
block_sort<itype, otype, arg_sort, bn, tn>( \
const device itype* inp [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant int& size_sorted_axis [[buffer(2)]], \
const constant int& stride_sorted_axis [[buffer(3)]], \
const constant int& stride_segment_axis [[buffer(4)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]]); \
template [[host_name("nc" #name "_" #itname "_" #otname "_bn" #bn "_tn" #tn \
)]] [[kernel]] void \
block_sort_nc<itype, otype, arg_sort, bn, tn>( \
const device itype* inp [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant int& size_sorted_axis [[buffer(2)]], \
const constant int& stride_sorted_axis [[buffer(3)]], \
const constant int& nc_dim [[buffer(4)]], \
const device int* nc_shape [[buffer(5)]], \
const device size_t* nc_strides [[buffer(6)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]]);
instantiate_kernel("c" #name "_" #itname "_" #otname "_bn" #bn "_tn" #tn, \
block_sort, itype, otype, arg_sort, bn, tn) \
instantiate_kernel("nc" #name "_" #itname "_" #otname "_bn" #bn "_tn" #tn, \
block_sort_nc, itype, otype, arg_sort, bn, tn)
#define instantiate_arg_block_sort_base(itname, itype, bn, tn) \
instantiate_block_sort( \
@@ -69,43 +51,12 @@ instantiate_block_sort_long(int64, int64_t)
#define instantiate_multi_block_sort( \
vtname, vtype, itname, itype, arg_sort, bn, tn) \
template [[host_name("sort_mbsort_" #vtname "_" #itname "_bn" #bn \
"_tn" #tn)]] [[kernel]] void \
mb_block_sort<vtype, itype, arg_sort, bn, tn>( \
const device vtype* inp [[buffer(0)]], \
device vtype* out_vals [[buffer(1)]], \
device itype* out_idxs [[buffer(2)]], \
const constant int& size_sorted_axis [[buffer(3)]], \
const constant int& stride_sorted_axis [[buffer(4)]], \
const constant int& nc_dim [[buffer(5)]], \
const device int* nc_shape [[buffer(6)]], \
const device size_t* nc_strides [[buffer(7)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]]); \
template [[host_name("partition_mbsort_" #vtname "_" #itname "_bn" #bn \
"_tn" #tn)]] [[kernel]] void \
mb_block_partition<vtype, itype, arg_sort, bn, tn>( \
device itype * block_partitions [[buffer(0)]], \
const device vtype* dev_vals [[buffer(1)]], \
const device itype* dev_idxs [[buffer(2)]], \
const constant int& size_sorted_axis [[buffer(3)]], \
const constant int& merge_tiles [[buffer(4)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 tgp_dims [[threads_per_threadgroup]]); \
template [[host_name("merge_mbsort_" #vtname "_" #itname "_bn" #bn \
"_tn" #tn)]] [[kernel]] void \
mb_block_merge<vtype, itype, arg_sort, bn, tn>( \
const device itype* block_partitions [[buffer(0)]], \
const device vtype* dev_vals_in [[buffer(1)]], \
const device itype* dev_idxs_in [[buffer(2)]], \
device vtype* dev_vals_out [[buffer(3)]], \
device itype* dev_idxs_out [[buffer(4)]], \
const constant int& size_sorted_axis [[buffer(5)]], \
const constant int& merge_tiles [[buffer(6)]], \
const constant int& num_tiles [[buffer(7)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]]);
instantiate_kernel("sort_mbsort_" #vtname "_" #itname "_bn" #bn "_tn" #tn, \
mb_block_sort, vtype, itype, arg_sort, bn, tn) \
instantiate_kernel("partition_mbsort_" #vtname "_" #itname "_bn" #bn "_tn" #tn, \
mb_block_partition, vtype, itype, arg_sort, bn, tn) \
instantiate_kernel("merge_mbsort_" #vtname "_" #itname "_bn" #bn "_tn" #tn, \
mb_block_merge, vtype, itype, arg_sort, bn, tn)
#define instantiate_multi_block_sort_base(vtname, vtype) \
instantiate_multi_block_sort(vtname, vtype, uint32, uint32_t, true, 512, 8)


@@ -9,96 +9,28 @@
#include "mlx/backend/metal/kernels/ternary_ops.h"
#include "mlx/backend/metal/kernels/ternary.h"
#define instantiate_ternary_v(name, type, op) \
template [[host_name("v_" name)]] [[kernel]] void ternary_v<type, op>( \
device const bool* a, \
device const type* b, \
device const type* c, \
device type* d, \
uint index [[thread_position_in_grid]]);
#define instantiate_ternary_all(op, tname, type) \
instantiate_kernel("v_" #op #tname, ternary_v, type, op) \
instantiate_kernel("g_" #op #tname, ternary_g, type, op) \
instantiate_kernel("g1_" #op #tname, ternary_g_nd1, type, op) \
instantiate_kernel("g2_" #op #tname, ternary_g_nd2, type, op) \
instantiate_kernel("g3_" #op #tname, ternary_g_nd3, type, op) \
instantiate_kernel("g4_" #op #tname, ternary_g_nd, type, op, 4) \
instantiate_kernel("g5_" #op #tname, ternary_g_nd, type, op, 5)
#define instantiate_ternary_g(name, type, op) \
template [[host_name("g_" name)]] [[kernel]] void ternary_g<type, op>( \
device const bool* a, \
device const type* b, \
device const type* c, \
device type* d, \
constant const int* shape, \
constant const size_t* a_strides, \
constant const size_t* b_strides, \
constant const size_t* c_strides, \
constant const int& ndim, \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_ternary_types(op) \
instantiate_ternary_all(op, bool_, bool) \
instantiate_ternary_all(op, uint8, uint8_t) \
instantiate_ternary_all(op, uint16, uint16_t) \
instantiate_ternary_all(op, uint32, uint32_t) \
instantiate_ternary_all(op, uint64, uint64_t) \
instantiate_ternary_all(op, int8, int8_t) \
instantiate_ternary_all(op, int16, int16_t) \
instantiate_ternary_all(op, int32, int32_t) \
instantiate_ternary_all(op, int64, int64_t) \
instantiate_ternary_all(op, float16, half) \
instantiate_ternary_all(op, float32, float) \
instantiate_ternary_all(op, bfloat16, bfloat16_t) \
instantiate_ternary_all(op, complex64, complex64_t) // clang-format on
#define instantiate_ternary_g_dim(name, type, op, dims) \
template [[host_name("g" #dims "_" name )]] [[kernel]] void \
ternary_g_nd<type, op, dims>( \
device const bool* a, \
device const type* b, \
device const type* c, \
device type* d, \
constant const int shape[dims], \
constant const size_t a_strides[dims], \
constant const size_t b_strides[dims], \
constant const size_t c_strides[dims], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_ternary_g_nd(name, type, op) \
template [[host_name("g1_" name)]] [[kernel]] void \
ternary_g_nd1<type, op>( \
device const bool* a, \
device const type* b, \
device const type* c, \
device type* d, \
constant const size_t& a_strides, \
constant const size_t& b_strides, \
constant const size_t& c_strides, \
uint index [[thread_position_in_grid]]); \
template [[host_name("g2_" name)]] [[kernel]] void \
ternary_g_nd2<type, op>( \
device const bool* a, \
device const type* b, \
device const type* c, \
device type* d, \
constant const size_t a_strides[2], \
constant const size_t b_strides[2], \
constant const size_t c_strides[2], \
uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]); \
template [[host_name("g3_" name)]] [[kernel]] void \
ternary_g_nd3<type, op>( \
device const bool* a, \
device const type* b, \
device const type* c, \
device type* d, \
constant const size_t a_strides[3], \
constant const size_t b_strides[3], \
constant const size_t c_strides[3], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
instantiate_ternary_g_dim(name, type, op, 4) \
instantiate_ternary_g_dim(name, type, op, 5)
#define instantiate_ternary_all(name, tname, type, op) \
instantiate_ternary_v(#name #tname, type, op) \
instantiate_ternary_g(#name #tname, type, op) \
instantiate_ternary_g_nd(#name #tname, type, op)
#define instantiate_ternary_types(name, op) \
instantiate_ternary_all(name, bool_, bool, op) \
instantiate_ternary_all(name, uint8, uint8_t, op) \
instantiate_ternary_all(name, uint16, uint16_t, op) \
instantiate_ternary_all(name, uint32, uint32_t, op) \
instantiate_ternary_all(name, uint64, uint64_t, op) \
instantiate_ternary_all(name, int8, int8_t, op) \
instantiate_ternary_all(name, int16, int16_t, op) \
instantiate_ternary_all(name, int32, int32_t, op) \
instantiate_ternary_all(name, int64, int64_t, op) \
instantiate_ternary_all(name, float16, half, op) \
instantiate_ternary_all(name, float32, float, op) \
instantiate_ternary_all(name, bfloat16, bfloat16_t, op) \
instantiate_ternary_all(name, complex64, complex64_t, op) // clang-format on
instantiate_ternary_types(select, Select)
instantiate_ternary_types(Select)


@@ -5,83 +5,68 @@
#include "mlx/backend/metal/kernels/unary_ops.h"
#include "mlx/backend/metal/kernels/unary.h"
#define instantiate_unary_v(name, type, op) \
template [[host_name(name)]] [[kernel]] void unary_v<type, op>( \
device const type* in, \
device type* out, \
uint index [[thread_position_in_grid]]);
#define instantiate_unary_all(op, tname, type) \
instantiate_kernel("v" #op #tname, unary_v, type, op) \
instantiate_kernel("g" #op #tname, unary_g, type, op)
#define instantiate_unary_g(name, type, op) \
template [[host_name(name)]] [[kernel]] void unary_g<type, op>( \
device const type* in, \
device type* out, \
device const int* in_shape, \
device const size_t* in_strides, \
device const int& ndim, \
uint index [[thread_position_in_grid]]);
#define instantiate_unary_float(op) \
instantiate_unary_all(op, float16, half) \
instantiate_unary_all(op, float32, float) \
instantiate_unary_all(op, bfloat16, bfloat16_t)
#define instantiate_unary_all(name, tname, type, op) \
instantiate_unary_v("v" #name #tname, type, op) \
instantiate_unary_g("g" #name #tname, type, op)
#define instantiate_unary_types(op) \
instantiate_unary_all(op, bool_, bool) \
instantiate_unary_all(op, uint8, uint8_t) \
instantiate_unary_all(op, uint16, uint16_t) \
instantiate_unary_all(op, uint32, uint32_t) \
instantiate_unary_all(op, uint64, uint64_t) \
instantiate_unary_all(op, int8, int8_t) \
instantiate_unary_all(op, int16, int16_t) \
instantiate_unary_all(op, int32, int32_t) \
instantiate_unary_all(op, int64, int64_t) \
instantiate_unary_float(op)
#define instantiate_unary_float(name, op) \
instantiate_unary_all(name, float16, half, op) \
instantiate_unary_all(name, float32, float, op) \
instantiate_unary_all(name, bfloat16, bfloat16_t, op)
instantiate_unary_types(Abs)
instantiate_unary_float(ArcCos)
instantiate_unary_float(ArcCosh)
instantiate_unary_float(ArcSin)
instantiate_unary_float(ArcSinh)
instantiate_unary_float(ArcTan)
instantiate_unary_float(ArcTanh)
instantiate_unary_types(Ceil)
instantiate_unary_float(Cos)
instantiate_unary_float(Cosh)
instantiate_unary_float(Exp)
instantiate_unary_float(Expm1)
instantiate_unary_types(Floor)
instantiate_unary_float(Log)
instantiate_unary_float(Log2)
instantiate_unary_float(Log10)
instantiate_unary_float(Log1p)
instantiate_unary_types(Negative)
instantiate_unary_float(Sigmoid)
instantiate_unary_float(Erf)
instantiate_unary_float(ErfInv)
instantiate_unary_types(Sign)
instantiate_unary_float(Sin)
instantiate_unary_float(Sinh)
instantiate_unary_types(Square)
instantiate_unary_float(Sqrt)
instantiate_unary_float(Rsqrt)
instantiate_unary_float(Tan)
instantiate_unary_float(Tanh)
instantiate_unary_float(Round)
#define instantiate_unary_types(name, op) \
instantiate_unary_all(name, bool_, bool, op) \
instantiate_unary_all(name, uint8, uint8_t, op) \
instantiate_unary_all(name, uint16, uint16_t, op) \
instantiate_unary_all(name, uint32, uint32_t, op) \
instantiate_unary_all(name, uint64, uint64_t, op) \
instantiate_unary_all(name, int8, int8_t, op) \
instantiate_unary_all(name, int16, int16_t, op) \
instantiate_unary_all(name, int32, int32_t, op) \
instantiate_unary_all(name, int64, int64_t, op) \
instantiate_unary_float(name, op)
instantiate_unary_all(Abs, complex64, complex64_t)
instantiate_unary_all(Conjugate, complex64, complex64_t)
instantiate_unary_all(Cos, complex64, complex64_t)
instantiate_unary_all(Cosh, complex64, complex64_t)
instantiate_unary_all(Exp, complex64, complex64_t)
instantiate_unary_all(Negative, complex64, complex64_t)
instantiate_unary_all(Sin, complex64, complex64_t)
instantiate_unary_all(Sinh, complex64, complex64_t)
instantiate_unary_all(Tan, complex64, complex64_t)
instantiate_unary_all(Tanh, complex64, complex64_t)
instantiate_unary_all(Round, complex64, complex64_t)
instantiate_unary_types(abs, Abs)
instantiate_unary_float(arccos, ArcCos)
instantiate_unary_float(arccosh, ArcCosh)
instantiate_unary_float(arcsin, ArcSin)
instantiate_unary_float(arcsinh, ArcSinh)
instantiate_unary_float(arctan, ArcTan)
instantiate_unary_float(arctanh, ArcTanh)
instantiate_unary_types(ceil, Ceil)
instantiate_unary_float(cos, Cos)
instantiate_unary_float(cosh, Cosh)
instantiate_unary_float(exp, Exp)
instantiate_unary_float(expm1, Expm1)
instantiate_unary_types(floor, Floor)
instantiate_unary_float(log, Log)
instantiate_unary_float(log2, Log2)
instantiate_unary_float(log10, Log10)
instantiate_unary_float(log1p, Log1p)
instantiate_unary_types(neg, Negative)
instantiate_unary_float(sigmoid, Sigmoid)
instantiate_unary_float(erf, Erf)
instantiate_unary_float(erfinv, ErfInv)
instantiate_unary_types(sign, Sign)
instantiate_unary_float(sin, Sin)
instantiate_unary_float(sinh, Sinh)
instantiate_unary_types(square, Square)
instantiate_unary_float(sqrt, Sqrt)
instantiate_unary_float(rsqrt, Rsqrt)
instantiate_unary_float(tan, Tan)
instantiate_unary_float(tanh, Tanh)
instantiate_unary_float(round, Round)
instantiate_unary_all(abs, complex64, complex64_t, Abs)
instantiate_unary_all(conj, complex64, complex64_t, Conjugate)
instantiate_unary_all(cos, complex64, complex64_t, Cos)
instantiate_unary_all(cosh, complex64, complex64_t, Cosh)
instantiate_unary_all(exp, complex64, complex64_t, Exp)
instantiate_unary_all(neg, complex64, complex64_t, Negative)
instantiate_unary_all(sin, complex64, complex64_t, Sin)
instantiate_unary_all(sinh, complex64, complex64_t, Sinh)
instantiate_unary_all(tan, complex64, complex64_t, Tan)
instantiate_unary_all(tanh, complex64, complex64_t, Tanh)
instantiate_unary_all(round, complex64, complex64_t, Round)
instantiate_unary_all(lnot, bool_, bool, LogicalNot) // clang-format on
instantiate_unary_all(LogicalNot, bool_, bool) // clang-format on


@@ -786,38 +786,47 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
// Determine dispatch kernel
int tm = 4, tn = 4;
int bm, bn, n_out_per_tgp;
int sm = 1, sn = 32;
int bm = 1, bn = 1;
int n_out_per_tgp;
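// Dispatch geometry: each simdgroup is laid out as sm x sn threads
// (sm * sn == 32), a threadgroup holds bm x bn simdgroups, and each thread
// produces a tm x tn block of outputs.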
std::ostringstream kname;
if (transpose_mat) {
bm = 8;
bn = 8;
if (out_vector_len >= 24576) {
bn = 128;
} else if (out_vector_len >= 16384) {
bn = 64;
} else if (out_vector_len >= 8192) {
if (in_vector_len >= 8192 && out_vector_len >= 2048) {
sm = 4;
sn = 8;
} else {
sm = 8;
sn = 4;
}
if (out_vector_len >= 2048) {
bn = 16;
} else if (out_vector_len >= 512) {
bn = 4;
} else {
bn = 2;
}
// Specialized kernel for very small outputs
tn = out_vector_len < tn ? 1 : tn;
n_out_per_tgp = bn * tn;
n_out_per_tgp = bn * sn * tn;
kname << "gemv_t_" << type_to_name(out);
} else {
bm = out_vector_len >= 4096 ? 8 : 4;
bn = 32;
sn = 32;
// Specialized kernel for very small outputs
tm = out_vector_len < tm ? 1 : tm;
n_out_per_tgp = bm * tm;
n_out_per_tgp = bm * sm * tm;
kname << "gemv_" << type_to_name(out);
}
kname << "_bm" << bm << "_bn" << bn << "_tm" << tm << "_tn" << tn;
kname << "_bm" << bm << "_bn" << bn << "_sm" << sm << "_sn" << sn << "_tm"
<< tm << "_tn" << tn;
kname << "_nc" << !contiguous_kernel << "_axpby0";
// Encode and dispatch kernel
@@ -826,7 +835,7 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder->setComputePipelineState(kernel);
int n_tgp = (out_vector_len + n_out_per_tgp - 1) / n_out_per_tgp;
MTL::Size group_dims = MTL::Size(bn, bm, 1);
MTL::Size group_dims = MTL::Size(32, bn, bm);
MTL::Size grid_dims = MTL::Size(n_tgp, 1, batch_size_out);
compute_encoder.set_input_array(mat, 0);
@@ -838,11 +847,9 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder->setBytes(&mat_ld, sizeof(int), 6);
compute_encoder->setBytes(&batch_ndim, sizeof(int), 9);
compute_encoder->setBytes(batch_shape.data(), batch_ndim * sizeof(int), 10);
compute_encoder->setBytes(
batch_strides_vec.data(), batch_ndim * sizeof(size_t), 11);
compute_encoder->setBytes(
batch_strides_mat.data(), batch_ndim * sizeof(size_t), 12);
set_vector_bytes(compute_encoder, batch_shape, 10);
set_vector_bytes(compute_encoder, batch_strides_vec, 11);
set_vector_bytes(compute_encoder, batch_strides_mat, 12);
compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
@@ -910,15 +917,19 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
/////////////////////////////////////////////////////////////////////////////
// Init checks and prep
int M = a_pre.shape(-2);
int N = b_pre.shape(-1);
int K = a_pre.shape(-1);
// Keep a vector with copies to be cleared in the completed buffer to release
// the arrays
std::vector<array> copies;
auto check_transpose = [&copies, &s](const array& arr) {
auto check_transpose = [&copies, &s](const array& arr, bool is_vector) {
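// is_vector marks the operand whose output dimension is 1; for it, the
// non-unit stride must also match the row length, otherwise the operand is
// copied to a dense layout before the gemv dispatch.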
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (sty == 1) {
if (sty == 1 && (!is_vector || stx == arr.shape(-1))) {
return std::make_tuple(false, stx, arr);
} else if (stx == 1) {
} else if (stx == 1 && (!is_vector || sty == arr.shape(-2))) {
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
@@ -929,12 +940,8 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
}
};
auto [transpose_a, a_cols, a] = check_transpose(a_pre);
auto [transpose_b, b_cols, b] = check_transpose(b_pre);
int M = a.shape(-2);
int N = b.shape(-1);
int K = a.shape(-1);
auto [transpose_a, a_cols, a] = check_transpose(a_pre, M == 1);
auto [transpose_b, b_cols, b] = check_transpose(b_pre, N == 1);
array c = c_pre;
int ldc = c.strides()[c.ndim() - 2];
@@ -997,38 +1004,47 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
// Determine dispatch kernel
int tm = 4, tn = 4;
int bm, bn, n_out_per_tgp;
int sm = 1, sn = 32;
int bm = 1, bn = 1;
int n_out_per_tgp;
std::ostringstream kname;
if (transpose_mat) {
bm = 8;
bn = 8;
if (out_vector_len >= 24576) {
bn = 128;
} else if (out_vector_len >= 16384) {
bn = 64;
} else if (out_vector_len >= 8192) {
if (in_vector_len >= 8192 && out_vector_len >= 2048) {
sm = 4;
sn = 8;
} else {
sm = 8;
sn = 4;
}
if (out_vector_len >= 2048) {
bn = 16;
} else if (out_vector_len >= 512) {
bn = 4;
} else {
bn = 2;
}
// Specialized kernel for very small outputs
tn = out_vector_len < tn ? 1 : tn;
n_out_per_tgp = bn * tn;
n_out_per_tgp = bn * sn * tn;
kname << "gemv_t_" << type_to_name(out);
} else {
bm = out_vector_len >= 4096 ? 8 : 4;
bn = 32;
sn = 32;
// Specialized kernel for very small outputs
tm = out_vector_len < tm ? 1 : tm;
n_out_per_tgp = bm * tm;
n_out_per_tgp = bm * sm * tm;
kname << "gemv_" << type_to_name(out);
}
kname << "_bm" << bm << "_bn" << bn << "_tm" << tm << "_tn" << tn;
kname << "_bm" << bm << "_bn" << bn << "_sm" << sm << "_sn" << sn << "_tm"
<< tm << "_tn" << tn;
kname << "_nc" << !contiguous_kernel << "_axpby1";
// Encode and dispatch kernel
@@ -1037,7 +1053,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder->setComputePipelineState(kernel);
int n_tgp = (out_vector_len + n_out_per_tgp - 1) / n_out_per_tgp;
MTL::Size group_dims = MTL::Size(bn, bm, 1);
MTL::Size group_dims = MTL::Size(32, bn, bm);
MTL::Size grid_dims = MTL::Size(n_tgp, 1, batch_size_out);
compute_encoder.set_input_array(mat, 0);
@@ -1344,15 +1360,19 @@ void BlockMaskedMM::eval_gpu(const std::vector<array>& inputs, array& out) {
/////////////////////////////////////////////////////////////////////////////
// Init checks and prep
int M = a_pre.shape(-2);
int N = b_pre.shape(-1);
int K = a_pre.shape(-1);
// Keep a vector with copies to be cleared in the completed buffer to release
// the arrays
std::vector<array> copies;
auto check_transpose = [&copies, &s](const array& arr) {
auto check_transpose = [&copies, &s](const array& arr, bool is_vector) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (sty == 1) {
if (sty == 1 && (!is_vector || stx == arr.shape(-1))) {
return std::make_tuple(false, stx, arr);
} else if (stx == 1) {
} else if (stx == 1 && (!is_vector || sty == arr.shape(-2))) {
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
@@ -1363,33 +1383,38 @@ void BlockMaskedMM::eval_gpu(const std::vector<array>& inputs, array& out) {
}
};
auto [transpose_a, a_cols, a] = check_transpose(a_pre);
auto [transpose_b, b_cols, b] = check_transpose(b_pre);
auto [transpose_a, a_cols, a] = check_transpose(a_pre, M == 1);
auto [transpose_b, b_cols, b] = check_transpose(b_pre, N == 1);
int lda = a_cols;
int ldb = b_cols;
int M = a.shape(-2);
int N = b.shape(-1);
int K = a.shape(-1);
/////////////////////////////////////////////////////////////////////////////
// Check and collapse batch dimensions
bool has_op_mask = inputs.size() > 3;
bool has_out_mask = inputs.size() == 3 || inputs.size() == 5;
// Prepare kernel name
std::string out_mask_nm = has_out_mask ? type_to_name(inputs[2]) : "nomask";
std::string op_mask_nm = has_op_mask ? type_to_name(inputs.back()) : "nomask";
auto get_batch_dims = [](const auto& v) {
return decltype(v){v.begin(), v.end() - 2};
};
std::vector<int> batch_shape{1};
std::vector<size_t> A_batch_stride{0};
std::vector<size_t> B_batch_stride{0};
std::vector<size_t> outmask_bstride{0};
std::vector<size_t> Amask_bstride{0};
std::vector<size_t> Bmask_bstride{0};
size_t A_batch_str = 0;
size_t B_batch_str = 0;
std::vector<size_t> batch_strides;
if (out.ndim() > 2) {
auto get_batch_dims = [](const auto& v) {
return decltype(v){v.begin(), v.end() - 2};
};
std::vector<int> bshape{out.shape().begin(), out.shape().end() - 2};
std::vector<std::vector<size_t>> bstrides;
@@ -1397,14 +1422,26 @@ void BlockMaskedMM::eval_gpu(const std::vector<array>& inputs, array& out) {
bstrides.emplace_back(arr.strides().begin(), arr.strides().end() - 2);
}
auto [bshape_c, bstrides_c] = collapse_contiguous_dims(bshape, bstrides);
batch_shape = bshape_c;
A_batch_str = bstrides_c[0].back();
B_batch_str = bstrides_c[1].back();
// auto [bshape_c, bstrides_c] = collapse_contiguous_dims(bshape, bstrides);
batch_shape = bshape;
A_batch_str = bstrides[0].back();
B_batch_str = bstrides[1].back();
for (auto& bstr : bstrides_c) {
for (auto& bstr : bstrides) {
batch_strides.insert(batch_strides.end(), bstr.begin(), bstr.end());
}
A_batch_stride = bstrides[0];
B_batch_stride = bstrides[1];
if (has_out_mask) {
outmask_bstride = bstrides[2];
}
if (has_op_mask) {
Amask_bstride = bstrides[has_out_mask + 2];
Bmask_bstride = bstrides[has_out_mask + 3];
}
} else {
batch_strides = std::vector<size_t>(inputs.size(), 0);
}
@@ -1412,6 +1449,174 @@ void BlockMaskedMM::eval_gpu(const std::vector<array>& inputs, array& out) {
size_t matrix_stride_out = size_t(M) * N;
size_t batch_size_out = out.size() / (matrix_stride_out);
/////////////////////////////////////////////////////////////////////////////
// Gemv specialization
// Route to gemv if needed
if (std::min(M, N) == 1) {
// Collect problem info
bool is_b_matrix = N != 1;
auto& mat = is_b_matrix ? b : a;
auto& vec = is_b_matrix ? a : b;
bool transpose_mat = is_b_matrix ? !transpose_b : transpose_a;
int in_vector_len = K;
int out_vector_len = is_b_matrix ? N : M;
int mat_cols = transpose_mat ? out_vector_len : in_vector_len;
int mat_rows = transpose_mat ? in_vector_len : out_vector_len;
int mat_ld = is_b_matrix ? b_cols : a_cols;
auto batch_strides_mat = is_b_matrix ? B_batch_stride : A_batch_stride;
auto batch_strides_vec = is_b_matrix ? A_batch_stride : B_batch_stride;
auto mask_bstrides_mat = is_b_matrix ? Bmask_bstride : Amask_bstride;
auto mask_bstrides_vec = is_b_matrix ? Amask_bstride : Bmask_bstride;
auto mat_mask_idx = int(has_out_mask) + (is_b_matrix ? 3 : 2);
auto vec_mask_idx = int(has_out_mask) + (is_b_matrix ? 2 : 3);
// Determine if inputs have simple batching / broadcasting
bool contiguous_kernel = (batch_shape.size() == 1);
int batch_ndim = batch_shape.size();
// Determine dispatch kernel
int tm = 4, tn = 4;
int sm = 1, sn = 32;
int bm = 1, bn = 1;
int n_out_per_tgp;
std::ostringstream kname;
if (transpose_mat) {
sm = 8;
sn = 4;
bm = 1;
bn = (block_size_ == 64 && out_vector_len >= 2048) ? 4 : 2;
tm = block_size_ == 32 ? 4 : 8;
tn = 4;
// Specialized kernel for very small outputs
tn = out_vector_len < tn ? 1 : tn;
n_out_per_tgp = bn * sn * tn;
kname << "gemv_t";
} else {
if (block_size_ == 32) {
sm = 4;
sn = 8;
bm = 2;
} else {
sm = 2;
sn = 16;
bm = out_vector_len >= 512 ? 4 : 2;
}
// Specialized kernel for very small outputs
tm = out_vector_len < tm ? 1 : tm;
n_out_per_tgp = bm * sm * tm;
kname << "gemv";
}
kname << "_outmask_" << out_mask_nm;
kname << "_opmask_" << op_mask_nm;
kname << "_" << type_to_name(out);
kname << "_bm" << bm << "_bn" << bn;
kname << "_sm" << sm << "_sn" << sn;
kname << "_tm" << tm << "_tn" << tn;
kname << "_nc" << !contiguous_kernel;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
compute_encoder->setComputePipelineState(kernel);
int n_tgp = (out_vector_len + n_out_per_tgp - 1) / n_out_per_tgp;
MTL::Size group_dims = MTL::Size(32, bn, bm);
MTL::Size grid_dims = MTL::Size(n_tgp, 1, batch_size_out);
// Get mask params
std::vector<int> mask_strides;
std::vector<size_t> mask_batch_strides;
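// mask_strides collects (row, col) stride pairs in order: output mask, then
// matrix-operand mask, then vector-operand mask, matching the mask buffers
// bound at indices 20-22.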
if (has_out_mask) {
auto& out_mask = inputs[2];
if (transpose_mat) {
mask_strides.push_back(out_mask.strides(out.shape(-2) == 1 ? -1 : -2));
mask_strides.push_back(out_mask.strides(out.shape(-2) == 1 ? -2 : -1));
} else {
mask_strides.push_back(out_mask.strides(out.shape(-1) == 1 ? -1 : -2));
mask_strides.push_back(out_mask.strides(out.shape(-1) == 1 ? -2 : -1));
}
mask_batch_strides.insert(
mask_batch_strides.end(),
outmask_bstride.begin(),
outmask_bstride.end());
compute_encoder.set_input_array(out_mask, 20);
}
if (has_op_mask) {
auto& mat_mask = inputs[mat_mask_idx];
if (transpose_mat) {
mask_strides.push_back(mat_mask.strides(!is_b_matrix ? -2 : -1));
mask_strides.push_back(mat_mask.strides(!is_b_matrix ? -1 : -2));
} else {
mask_strides.push_back(mat_mask.strides(is_b_matrix ? -2 : -1));
mask_strides.push_back(mat_mask.strides(is_b_matrix ? -1 : -2));
}
mask_batch_strides.insert(
mask_batch_strides.end(),
mask_bstrides_mat.begin(),
mask_bstrides_mat.end());
compute_encoder.set_input_array(mat_mask, 21);
auto& vec_mask = inputs[vec_mask_idx];
if (transpose_mat) {
mask_strides.push_back(vec_mask.strides(vec.shape(-2) == 1 ? -1 : -2));
mask_strides.push_back(vec_mask.strides(vec.shape(-2) == 1 ? -2 : -1));
} else {
mask_strides.push_back(vec_mask.strides(vec.shape(-1) == 1 ? -1 : -2));
mask_strides.push_back(vec_mask.strides(vec.shape(-1) == 1 ? -2 : -1));
}
mask_batch_strides.insert(
mask_batch_strides.end(),
mask_bstrides_vec.begin(),
mask_bstrides_vec.end());
compute_encoder.set_input_array(vec_mask, 22);
}
// Get gemv params
compute_encoder.set_input_array(mat, 0);
compute_encoder.set_input_array(vec, 1);
compute_encoder.set_output_array(out, 3);
compute_encoder->setBytes(&in_vector_len, sizeof(int), 4);
compute_encoder->setBytes(&out_vector_len, sizeof(int), 5);
compute_encoder->setBytes(&mat_ld, sizeof(int), 6);
compute_encoder->setBytes(&batch_ndim, sizeof(int), 9);
set_vector_bytes(compute_encoder, batch_shape, 10);
set_vector_bytes(compute_encoder, batch_strides_vec, 11);
set_vector_bytes(compute_encoder, batch_strides_mat, 12);
set_vector_bytes(compute_encoder, mask_strides, 23);
set_vector_bytes(compute_encoder, mask_batch_strides, 24);
compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
d.get_command_buffer(s.index)->addCompletedHandler(
[copies](MTL::CommandBuffer*) mutable { copies.clear(); });
return;
}
/////////////////////////////////////////////////////////////////////////////
// Regular kernel dispatch
@@ -1421,10 +1626,6 @@ void BlockMaskedMM::eval_gpu(const std::vector<array>& inputs, array& out) {
bool mn_aligned = M % bm == 0 && N % bn == 0;
bool k_aligned = K % bk == 0;
// Prepare kernel name
std::string out_mask_nm = has_out_mask ? type_to_name(inputs[2]) : "nomask";
std::string op_mask_nm = has_op_mask ? type_to_name(inputs.back()) : "nomask";
std::ostringstream kname;
kname << "steel_gemm_block_outmask_" << out_mask_nm << "_opmask_"
<< op_mask_nm << "_" << (transpose_a ? 't' : 'n')
@@ -1554,15 +1755,19 @@ void GatherMM::eval_gpu(const std::vector<array>& inputs, array& out) {
/////////////////////////////////////////////////////////////////////////////
// Init checks and prep
int M = a_pre.shape(-2);
int N = b_pre.shape(-1);
int K = a_pre.shape(-1);
// Keep a vector with copies to be cleared in the completed buffer to release
// the arrays
std::vector<array> copies;
auto check_transpose = [&copies, &s](const array& arr) {
auto check_transpose = [&copies, &s](const array& arr, bool is_vector) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (sty == 1) {
if (sty == 1 && (!is_vector || stx == arr.shape(-1))) {
return std::make_tuple(false, stx, arr);
} else if (stx == 1) {
} else if (stx == 1 && (!is_vector || sty == arr.shape(-2))) {
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
@@ -1573,16 +1778,12 @@ void GatherMM::eval_gpu(const std::vector<array>& inputs, array& out) {
}
};
auto [transpose_a, a_cols, a] = check_transpose(a_pre);
auto [transpose_b, b_cols, b] = check_transpose(b_pre);
auto [transpose_a, a_cols, a] = check_transpose(a_pre, M == 1);
auto [transpose_b, b_cols, b] = check_transpose(b_pre, N == 1);
int lda = a_cols;
int ldb = b_cols;
int M = a.shape(-2);
int N = b.shape(-1);
int K = a.shape(-1);
/////////////////////////////////////////////////////////////////////////////
// Check and collapse batch dimensions
@@ -1673,38 +1874,47 @@ void GatherMM::eval_gpu(const std::vector<array>& inputs, array& out) {
// Determine dispatch kernel
int tm = 4, tn = 4;
int bm, bn, n_out_per_tgp;
int sm = 1, sn = 32;
int bm = 1, bn = 1;
int n_out_per_tgp;
std::ostringstream kname;
if (transpose_mat) {
bm = 8;
bn = 8;
if (out_vector_len >= 24576) {
bn = 128;
} else if (out_vector_len >= 16384) {
bn = 64;
} else if (out_vector_len >= 8192) {
if (in_vector_len >= 8192 && out_vector_len >= 2048) {
sm = 4;
sn = 8;
} else {
sm = 8;
sn = 4;
}
if (out_vector_len >= 2048) {
bn = 16;
} else if (out_vector_len >= 512) {
bn = 4;
} else {
bn = 2;
}
// Specialized kernel for very small outputs
tn = out_vector_len < tn ? 1 : tn;
n_out_per_tgp = bn * tn;
kname << "gemv_t_bs_" << type_to_name(out);
n_out_per_tgp = bn * sn * tn;
kname << "gemv_t_gather_" << type_to_name(out);
} else {
bm = out_vector_len >= 4096 ? 8 : 4;
bn = 32;
sn = 32;
// Specialized kernel for very small outputs
tm = out_vector_len < tm ? 1 : tm;
n_out_per_tgp = bm * tm;
kname << "gemv_bs_" << type_to_name(out);
n_out_per_tgp = bm * sm * tm;
kname << "gemv_gather_" << type_to_name(out);
}
kname << "_bm" << bm << "_bn" << bn << "_tm" << tm << "_tn" << tn;
kname << "_bm" << bm << "_bn" << bn << "_sm" << sm << "_sn" << sn << "_tm"
<< tm << "_tn" << tn;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
@@ -1712,7 +1922,7 @@ void GatherMM::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder->setComputePipelineState(kernel);
int n_tgp = (out_vector_len + n_out_per_tgp - 1) / n_out_per_tgp;
MTL::Size group_dims = MTL::Size(bn, bm, 1);
MTL::Size group_dims = MTL::Size(32, bn, bm);
MTL::Size grid_dims = MTL::Size(n_tgp, 1, batch_size_out);
compute_encoder.set_input_array(mat, 0);


@@ -15,30 +15,34 @@ MTL::ComputePipelineState* get_arange_kernel(
MTL::ComputePipelineState* get_unary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array&) {
Dtype,
const std::string) {
return d.get_kernel(kernel_name);
}
MTL::ComputePipelineState* get_binary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array&,
const array&) {
Dtype,
Dtype,
const std::string) {
return d.get_kernel(kernel_name);
}
MTL::ComputePipelineState* get_binary_two_kernel(
metal::Device& d,
const std::string& kernel_name,
const array&,
const array&) {
Dtype,
Dtype,
const std::string) {
return d.get_kernel(kernel_name);
}
MTL::ComputePipelineState* get_ternary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array&) {
Dtype,
const std::string) {
return d.get_kernel(kernel_name);
}
@@ -63,6 +67,7 @@ MTL::ComputePipelineState* get_scan_kernel(
const std::string& kernel_name,
bool,
bool,
const std::string&,
const array&,
const array&) {
return d.get_kernel(kernel_name);
@@ -190,4 +195,20 @@ MTL::ComputePipelineState* get_steel_conv_general_kernel(
return d.get_kernel(kernel_name);
}
MTL::ComputePipelineState* get_fft_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& hash_name,
const metal::MTLFCList& func_consts,
const std::string&) {
return d.get_kernel(kernel_name, "mlx", hash_name, func_consts);
}
MTL::ComputePipelineState* get_quantized_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string&) {
return d.get_kernel(kernel_name);
}
} // namespace mlx::core


@@ -7,6 +7,7 @@
#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/slicing.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
@@ -163,37 +164,14 @@ void Broadcast::eval_gpu(const std::vector<array>& inputs, array& out) {
}
void Concatenate::eval_gpu(const std::vector<array>& inputs, array& out) {
std::vector<int> sizes;
sizes.push_back(0);
for (auto& p : inputs) {
sizes.push_back(p.shape(axis_));
}
std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto strides = out.strides();
auto flags = out.flags();
flags.row_contiguous = false;
flags.col_contiguous = false;
flags.contiguous = false;
auto& d = metal::device(stream().device);
auto& compute_encoder = d.get_command_encoder(stream().index);
auto concurrent_ctx = compute_encoder.start_concurrent();
for (int i = 0; i < inputs.size(); i++) {
array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
size_t data_offset = strides[axis_] * sizes[i];
out_slice.copy_shared_buffer(
out, strides, flags, out_slice.size(), data_offset);
copy_gpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
}
concatenate_gpu(inputs, out, axis_, stream());
}
void Copy::eval_gpu(const std::vector<array>& inputs, array& out) {
eval(inputs, out);
}
void CustomVJP::eval_gpu(
void CustomTransforms::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
eval(inputs, outputs);
@@ -238,23 +216,7 @@ void Pad::eval_gpu(const std::vector<array>& inputs, array& out) {
// Padding value, input and output must be of the same type
assert(val.dtype() == in.dtype() && in.dtype() == out.dtype());
// Fill output with val
copy_gpu(val, out, CopyType::Scalar, stream());
// Find offset for start of input values
size_t data_offset = 0;
for (int i = 0; i < axes_.size(); i++) {
auto ax = axes_[i] < 0 ? out.ndim() + axes_[i] : axes_[i];
data_offset += out.strides()[ax] * low_pad_size_[i];
}
// Extract slice from output where input will be pasted
array out_slice(in.shape(), out.dtype(), nullptr, {});
out_slice.copy_shared_buffer(
out, out.strides(), out.flags(), out_slice.size(), data_offset);
// Copy input values into the slice
copy_gpu_inplace(in, out_slice, CopyType::GeneralGeneral, stream());
pad_gpu(in, val, out, axes_, low_pad_size_, stream());
}
void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -311,7 +273,18 @@ void Reshape::eval_gpu(const std::vector<array>& inputs, array& out) {
auto [copy_necessary, out_strides] = prepare_reshape(in, out);
if (copy_necessary) {
copy_gpu(in, out, CopyType::General);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto out_strides = make_contiguous_strides<size_t>(in.shape());
copy_gpu_inplace(
in,
out,
in.shape(),
in.strides(),
out_strides,
0,
0,
CopyType::General,
stream());
} else {
shared_buffer_reshape(in, out_strides, out);
}
@@ -331,28 +304,7 @@ void Slice::eval_gpu(const std::vector<array>& inputs, array& out) {
}
auto& in = inputs[0];
// Calculate out strides, initial offset and if copy needs to be made
auto [copy_needed, data_offset, inp_strides] = prepare_slice(in);
// Do copy if needed
if (copy_needed) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
std::vector<int64_t> ostrides{out.strides().begin(), out.strides().end()};
copy_gpu_inplace(
/* const array& in = */ in,
/* array& out = */ out,
/* const std::vector<int>& data_shape = */ out.shape(),
/* const std::vector<stride_t>& i_strides = */ inp_strides,
/* const std::vector<stride_t>& o_strides = */ ostrides,
/* int64_t i_offset = */ data_offset,
/* int64_t o_offset = */ 0,
/* CopyType ctype = */ CopyType::General,
/* const Stream& s = */ stream());
} else {
std::vector<size_t> ostrides{inp_strides.begin(), inp_strides.end()};
shared_buffer_slice(in, ostrides, data_offset, out);
}
slice_gpu(in, out, start_indices_, strides_, stream());
}
void SliceUpdate::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -422,4 +374,35 @@ void Cholesky::eval_gpu(const std::vector<array>& inputs, array& out) {
"[Cholesky::eval_gpu] Metal Cholesky decomposition NYI.");
}
void View::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& in = inputs[0];
auto ibytes = size_of(in.dtype());
auto obytes = size_of(out.dtype());
// Conditions for buffer copying (disjunction):
// - type size is the same
// - type size is smaller and the last axis is contiguous
// - the entire array is row contiguous
if (ibytes == obytes || obytes < ibytes && in.strides().back() == 1 ||
in.flags().row_contiguous) {
auto strides = in.strides();
for (int i = 0; i < strides.size() - 1; ++i) {
strides[i] *= ibytes;
strides[i] /= obytes;
}
out.copy_shared_buffer(
in, strides, in.flags(), in.data_size() * ibytes / obytes);
} else {
auto tmp = array(in.shape(), in.dtype(), nullptr, {});
tmp.set_data(allocator::malloc_or_wait(tmp.nbytes()));
copy_gpu_inplace(in, tmp, CopyType::General, stream());
auto flags = out.flags();
flags.contiguous = true;
flags.row_contiguous = true;
auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
out.move_shared_buffer(tmp, out.strides(), flags, out.size());
}
}
} // namespace mlx::core
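
Note: the View branch above reuses the input buffer by rescaling every stride except the last by the ratio of the old and new item sizes. A standalone sketch of just that arithmetic (shapes and values are illustrative):

#include <cstddef>
#include <iostream>
#include <vector>

// Scale all strides but the innermost one when reinterpreting the element type.
std::vector<size_t> rescale_strides(std::vector<size_t> strides, size_t ibytes, size_t obytes) {
  for (size_t i = 0; i + 1 < strides.size(); ++i) {
    strides[i] = strides[i] * ibytes / obytes;
  }
  return strides;
}

int main() {
  // A contiguous (4, 6) float32 array viewed as int16: strides {6, 1} -> {12, 1}.
  auto s = rescale_strides({6, 1}, /*ibytes=*/4, /*obytes=*/2);
  std::cout << s[0] << ", " << s[1] << "\n"; // 12, 1
}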

View File

@@ -2,8 +2,10 @@
#include <cassert>
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
@@ -44,12 +46,15 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
// Route to the fast qmv kernel that has no bounds checking
if (B < 6 && O % 8 == 0 && D % 512 == 0 && D >= 512) {
std::ostringstream kname;
kname << "qmv_" << type_to_name(out) << "_gs_" << group_size_ << "_b_"
<< bits_ << "_fast";
auto type_string = get_type_string(x.dtype());
kname << "qmv_" << type_string << "_gs_" << group_size_ << "_b_" << bits_
<< "_fast";
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "qmv_fast", type_string, group_size_, bits_);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int bo = 8;
@@ -71,12 +76,14 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
// Route to the qmv kernel
else if (B < 6) {
std::ostringstream kname;
kname << "qmv_" << type_to_name(out) << "_gs_" << group_size_ << "_b_"
<< bits_;
auto type_string = get_type_string(x.dtype());
kname << "qmv_" << type_string << "_gs_" << group_size_ << "_b_" << bits_;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "qmv", type_string, group_size_, bits_);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int bo = 8;
@@ -98,12 +105,16 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
// Route to the qmm_t kernel
else {
std::ostringstream kname;
kname << "qmm_t_" << type_to_name(out) << "_gs_" << group_size_ << "_b_"
<< bits_ << "_alN_" << std::boolalpha << ((O % 32) == 0);
std::string aligned_n = (O % 32) == 0 ? "true" : "false";
auto type_string = get_type_string(x.dtype());
kname << "qmm_t_" << type_string << "_gs_" << group_size_ << "_b_"
<< bits_ << "_alN_" << aligned_n;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "qmm_t", type_string, group_size_, bits_, aligned_n);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int wn = 2;
@@ -129,12 +140,14 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
// Route to the qvm kernel
if (B < 4) {
std::ostringstream kname;
kname << "qvm_" << type_to_name(out) << "_gs_" << group_size_ << "_b_"
<< bits_;
auto type_string = get_type_string(x.dtype());
kname << "qvm_" << type_string << "_gs_" << group_size_ << "_b_" << bits_;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "qvm", type_string, group_size_, bits_);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int bo = 64;
@@ -156,12 +169,15 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
// Route to the qmm_n kernel
else {
std::ostringstream kname;
kname << "qmm_n_" << type_to_name(out) << "_gs_" << group_size_ << "_b_"
auto type_string = get_type_string(x.dtype());
kname << "qmm_n_" << type_string << "_gs_" << group_size_ << "_b_"
<< bits_;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "qmm_n", type_string, group_size_, bits_);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int wn = 2;
@@ -253,12 +269,15 @@ void GatherQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
// Route to the fast bs_qmv kernel that has no bounds checking
if (B < 6 && O % 8 == 0 && D % 512 == 0 && D >= 512) {
std::ostringstream kname;
kname << "bs_qmv_" << type_to_name(out) << "_gs_" << group_size_ << "_b_"
auto type_string = get_type_string(x.dtype());
kname << "bs_qmv_" << type_string << "_gs_" << group_size_ << "_b_"
<< bits_ << "_fast";
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "bs_qmv_fast", type_string, group_size_, bits_);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int bo = 8;
@@ -295,12 +314,15 @@ void GatherQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
else if (B < 6) {
std::ostringstream kname;
kname << "bs_qmv_" << type_to_name(out) << "_gs_" << group_size_ << "_b_"
auto type_string = get_type_string(x.dtype());
kname << "bs_qmv_" << type_string << "_gs_" << group_size_ << "_b_"
<< bits_;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "bs_qmv", type_string, group_size_, bits_);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int bo = 8;
@@ -338,12 +360,16 @@ void GatherQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
// Route to the bs_qmm_t
else {
std::ostringstream kname;
kname << "bs_qmm_t_" << type_to_name(out) << "_gs_" << group_size_
<< "_b_" << bits_ << "_alN_" << std::boolalpha << ((O % 32) == 0);
std::string aligned_n = (O % 32) == 0 ? "true" : "false";
auto type_string = get_type_string(out.dtype());
kname << "bs_qmm_t_" << type_string << "_gs_" << group_size_ << "_b_"
<< bits_ << "_alN_" << aligned_n;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "bs_qmm_t", type_string, group_size_, bits_, aligned_n);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int wn = 2;
@@ -385,12 +411,15 @@ void GatherQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
// Route to the bs_qvm kernel
if (B < 4) {
std::ostringstream kname;
kname << "bs_qvm_" << type_to_name(out) << "_gs_" << group_size_ << "_b_"
auto type_string = get_type_string(out.dtype());
kname << "bs_qvm_" << type_string << "_gs_" << group_size_ << "_b_"
<< bits_;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "bs_qvm", type_string, group_size_, bits_);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int bo = 64;
@@ -428,12 +457,15 @@ void GatherQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
// Route to bs_qmm_n
else {
std::ostringstream kname;
kname << "bs_qmm_n_" << type_to_name(out) << "_gs_" << group_size_
<< "_b_" << bits_;
auto type_string = get_type_string(out.dtype());
kname << "bs_qmm_n_" << type_string << "_gs_" << group_size_ << "_b_"
<< bits_;
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto template_def = get_template_definition(
kname.str(), "bs_qmm_n", type_string, group_size_, bits_);
auto kernel = get_quantized_kernel(d, kname.str(), template_def);
compute_encoder->setComputePipelineState(kernel);
int wn = 2;
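
Note: throughout this file the change follows one pattern: kernel names are built from get_type_string(x.dtype()) and paired with a template-definition string so the quantized kernels can be JIT-compiled via get_quantized_kernel. A standalone sketch of the name-assembly part only (that get_type_string returns "float16_t" for half precision is an assumption used for the example):

#include <iostream>
#include <sstream>
#include <string>

// Builds a routing name like "qmv_float16_t_gs_64_b_4_fast" from the parameters.
std::string qmv_fast_kernel_name(const std::string& type_string, int group_size, int bits) {
  std::ostringstream kname;
  kname << "qmv_" << type_string << "_gs_" << group_size << "_b_" << bits << "_fast";
  return kname.str();
}

int main() {
  std::cout << qmv_fast_kernel_name("float16_t", 64, 4) << "\n";
}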

View File

@@ -19,6 +19,140 @@
namespace mlx::core::fast {
namespace {
void sdpa_full_self_attention_metal(
const Stream& s,
metal::Device& d,
const array& q,
const array& k,
const array& v,
const float alpha,
array& out,
std::vector<array>& temporaries) {
std::ostringstream kname_self_attention;
kname_self_attention << "steel_gemm_attention_";
constexpr const int bm = 16;
constexpr const int bn = 16;
const int bk = q.shape(-1); // already forced to be 64 or 128
if (bk != 64 && bk != 128) {
throw std::runtime_error(
"[ScaledDotProductAttention::eval_gpu]: hidden dim: expected either 64, 128");
}
constexpr const int wm = 2;
constexpr const int wn = 2;
std::string delimiter = "_";
kname_self_attention << "bm_" + std::to_string(bm) + delimiter;
kname_self_attention << "bn_" + std::to_string(bn) + delimiter;
kname_self_attention << "bk_" + std::to_string(bk) + delimiter;
for (const auto& arr : {k, v, out}) {
if (arr.dtype() != q.dtype()) {
throw std::runtime_error(
"[ScaledDotProductAttention::eval_gpu]: expected matching dtypes for q,k,v,o");
}
}
if (q.dtype() == float32) {
kname_self_attention << "itype" + delimiter + "float";
} else if (q.dtype() == float16) {
kname_self_attention << "itype" + delimiter + "half";
} else {
throw std::runtime_error(
"[ScaledDotProductAttention::eval_gpu]: unexpected dtype found for queries: expected either float32 or float16.");
}
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname_self_attention.str());
compute_encoder->setComputePipelineState(kernel);
uint hidden_dim = q.shape(-1);
uint qseq = q.shape(-2);
uint qheads = q.shape(-3);
const uint64_t KV_sequence_length = k.shape(-2);
const uint query_sequence_length = q.shape(-2);
const uint n_q_heads = q.shape(1);
const uint n_kv_heads = k.shape(1);
const int M = q.shape(-2);
const int N = M;
const int K = q.shape(-1);
const size_t batch_size_out = q.shape(0) * q.shape(1);
const std::vector<int> batch_shape = {q.shape(0) * q.shape(1)};
const int dk = q.shape(-1);
const int ldq = dk;
const int ldk = dk;
const int ldv = dk;
const int lds = bn;
const int ldo = dk;
int tn = 1;
int tm = (M + bm - 1) / bm;
const int batch_stride_q = dk * query_sequence_length;
const int batch_stride_k = dk * query_sequence_length;
const int batch_stride_v = dk * query_sequence_length;
const int batch_stride_o = dk * query_sequence_length;
const int swizzle_log = 0;
const int gemm_n_iterations_aligned = (N + bn - 1) / bn;
const int gemm_k_iterations_aligned = (K + bk - 1) / bk;
const int gemm_sv_m_block_iterations = (M + bm - 1) / bm;
const int batch_ndim = int(batch_shape.size());
MLXFastAttentionParams params{
(int)M,
(int)N,
(int)K,
ldq,
ldk,
ldv,
lds,
ldo,
tn,
tm,
batch_stride_q,
batch_stride_k,
batch_stride_v,
batch_stride_o,
swizzle_log,
gemm_n_iterations_aligned,
gemm_k_iterations_aligned,
gemm_sv_m_block_iterations,
batch_ndim,
alpha};
const std::vector<size_t> batch_strides = {
(size_t)batch_stride_q,
(size_t)batch_stride_k,
(size_t)batch_stride_v,
(size_t)batch_stride_o};
compute_encoder.set_input_array(q, 0);
compute_encoder.set_input_array(k, 1);
compute_encoder.set_input_array(v, 2);
compute_encoder.set_output_array(out, 3);
compute_encoder->setBytes(&params, sizeof(MLXFastAttentionParams), 4);
compute_encoder->setBytes(
batch_shape.data(), sizeof(int) * batch_shape.size(), 6);
compute_encoder->setBytes(
batch_strides.data(), sizeof(size_t) * batch_strides.size(), 7);
MTL::Size grid_dims = MTL::Size(1, tm, batch_size_out);
MTL::Size group_dims = MTL::Size(32, wm, wn);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
d.get_command_buffer(s.index)->addCompletedHandler(
[temporaries](MTL::CommandBuffer*) mutable { temporaries.clear(); });
return;
}
void sdpa_metal(
const Stream& s,
@@ -170,6 +304,12 @@ void ScaledDotProductAttention::eval_gpu(
auto v = check_transpose(v_pre);
const int heads = q.shape(-3);
uint query_sequence_length = q.shape(-2);
if (query_sequence_length >= 16) {
return sdpa_full_self_attention_metal(
s, d, q, k, v, scale_, out, temporaries);
}
int tile_size = 64;
const int kv_seq_len = k.shape(-2);
if (kv_seq_len > 8000) {
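
Note: the new full self-attention path above is taken when the query sequence length is at least 16, and its grid sizing reduces to ceil-division of the GEMM dimensions by the 16x16 block, with one threadgroup per row tile per batch element. A standalone sketch of that arithmetic (the sequence length is chosen arbitrarily):

#include <iostream>

constexpr int ceil_div(int a, int b) {
  return (a + b - 1) / b;
}

int main() {
  constexpr int bm = 16, bn = 16;
  int M = 1000;  // query sequence length
  int N = M;     // key/value sequence length (self-attention)
  std::cout << ceil_div(M, bm) << " row tiles, "
            << ceil_div(N, bn) << " n-iterations per tile\n"; // 63, 63
}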

View File

@@ -38,22 +38,25 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
kname << "reverse_";
}
kname << ((inclusive_) ? "inclusive_" : "exclusive_");
std::string reduce_type;
switch (reduce_type_) {
case Scan::Sum:
kname << "sum_";
reduce_type = "sum";
break;
case Scan::Prod:
kname << "prod_";
reduce_type = "prod";
break;
case Scan::Max:
kname << "max_";
reduce_type = "max";
break;
case Scan::Min:
kname << "min_";
reduce_type = "min";
break;
}
kname << type_to_name(in) << "_" << type_to_name(out);
auto kernel = get_scan_kernel(d, kname.str(), reverse_, inclusive_, in, out);
kname << reduce_type << "_" << type_to_name(in) << "_" << type_to_name(out);
auto kernel = get_scan_kernel(
d, kname.str(), reverse_, inclusive_, reduce_type, in, out);
if (contiguous) {
auto& compute_encoder = d.get_command_encoder(s.index);
@@ -65,16 +68,16 @@ void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
// Compute the thread grid
int n_reads = (in.itemsize() <= 4) ? 4 : 2;
int elements_per_simd = n_reads * 32;
constexpr int simd_size = 32;
int elements_per_simd = n_reads * simd_size;
int thread_groups = in.size() / size;
int thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (size < n_reads * 1024) {
thread_group_size = ((size + elements_per_simd - 1) / elements_per_simd) *
elements_per_simd;
} else if (size < n_reads * 2048) {
if (size <= n_reads * 1024) {
thread_group_size =
((size / 2 + elements_per_simd - 1) / elements_per_simd) *
elements_per_simd;
((size + elements_per_simd - 1) / elements_per_simd) * simd_size;
} else if (size <= n_reads * 2048) {
thread_group_size =
((size / 2 + elements_per_simd - 1) / elements_per_simd) * simd_size;
}
thread_group_size = std::min(
thread_group_size,
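
Note: the resized threadgroup logic above rounds up to whole SIMD groups of 32 threads, choosing enough groups that each thread reads n_reads elements. A standalone sketch of the first branch (size <= n_reads * 1024); the 1024 cap stands in for the pipeline's maxTotalThreadsPerThreadgroup:

#include <algorithm>
#include <iostream>

int main() {
  constexpr int simd_size = 32;
  int n_reads = 4;
  int size = 3000;                              // elements along the scan axis
  int elements_per_simd = n_reads * simd_size;  // 128 elements per SIMD group
  int tg = ((size + elements_per_simd - 1) / elements_per_simd) * simd_size;
  tg = std::min(tg, 1024);                      // illustrative max threadgroup size
  std::cout << tg << "\n";                      // 24 SIMD groups -> 768 threads
}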

View File

@@ -0,0 +1,98 @@
// Copyright © 2024 Apple Inc.
#include <numeric>
#include "mlx/backend/common/slicing.h"
#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
namespace mlx::core {
void slice_gpu(
const array& in,
array& out,
std::vector<int> start_indices,
std::vector<int> strides,
const Stream& s) {
// Calculate out strides, initial offset and if copy needs to be made
auto [copy_needed, data_offset, inp_strides] =
prepare_slice(in, start_indices, strides);
// Do copy if needed
if (copy_needed) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
std::vector<int64_t> ostrides{out.strides().begin(), out.strides().end()};
copy_gpu_inplace(
/* const array& in = */ in,
/* array& out = */ out,
/* const std::vector<int>& data_shape = */ out.shape(),
/* const std::vector<stride_t>& i_strides = */ inp_strides,
/* const std::vector<stride_t>& o_strides = */ ostrides,
/* int64_t i_offset = */ data_offset,
/* int64_t o_offset = */ 0,
/* CopyType ctype = */ CopyType::General,
/* const Stream& s = */ s);
} else {
std::vector<size_t> ostrides{inp_strides.begin(), inp_strides.end()};
shared_buffer_slice(in, ostrides, data_offset, out);
}
}
void concatenate_gpu(
const std::vector<array>& inputs,
array& out,
int axis,
const Stream& s) {
std::vector<int> sizes;
sizes.push_back(0);
for (auto& p : inputs) {
sizes.push_back(p.shape(axis));
}
std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto strides = out.strides();
auto flags = out.flags();
flags.row_contiguous = false;
flags.col_contiguous = false;
flags.contiguous = false;
auto& d = metal::device(s.device);
auto& compute_encoder = d.get_command_encoder(s.index);
auto concurrent_ctx = compute_encoder.start_concurrent();
for (int i = 0; i < inputs.size(); i++) {
array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
size_t data_offset = strides[axis] * sizes[i];
out_slice.copy_shared_buffer(
out, strides, flags, out_slice.size(), data_offset);
copy_gpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, s);
}
}
void pad_gpu(
const array& in,
const array& val,
array& out,
std::vector<int> axes,
std::vector<int> low_pad_size,
const Stream& s) {
// Fill output with val
copy_gpu(val, out, CopyType::Scalar, s);
// Find offset for start of input values
size_t data_offset = 0;
for (int i = 0; i < axes.size(); i++) {
auto ax = axes[i] < 0 ? out.ndim() + axes[i] : axes[i];
data_offset += out.strides()[ax] * low_pad_size[i];
}
// Extract slice from output where input will be pasted
array out_slice(in.shape(), out.dtype(), nullptr, {});
out_slice.copy_shared_buffer(
out, out.strides(), out.flags(), out_slice.size(), data_offset);
// Copy input values into the slice
copy_gpu_inplace(in, out_slice, CopyType::GeneralGeneral, s);
}
} // namespace mlx::core
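
Note: concatenate_gpu above locates where each input lands in the output with a prefix sum over the per-input sizes along the concatenation axis, then copies each input into a shared-buffer slice at that offset. A standalone sketch of the offset bookkeeping (sizes are illustrative):

#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> axis_sizes = {2, 3, 4};  // each input's extent along the concat axis
  std::vector<int> offsets = {0};
  offsets.insert(offsets.end(), axis_sizes.begin(), axis_sizes.end());
  // Running sum gives the element offset at which each input is written.
  std::partial_sum(offsets.cbegin(), offsets.cend(), offsets.begin());
  for (int o : offsets) std::cout << o << " ";  // 0 2 5 9
  std::cout << "\n";
}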

View File

@@ -0,0 +1,30 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/array.h"
namespace mlx::core {
void slice_gpu(
const array& in,
array& out,
std::vector<int> start_indices,
std::vector<int> strides,
const Stream& s);
void concatenate_gpu(
const std::vector<array>& inputs,
array& out,
int axis,
const Stream& s);
void pad_gpu(
const array& in,
const array& val,
array& out,
std::vector<int> axes,
std::vector<int> low_pad_size,
const Stream& s);
} // namespace mlx::core

View File

@@ -24,8 +24,11 @@ void single_block_sort(
// Prepare shapes
int n_rows = in.size() / in.shape(axis);
std::vector<size_t> nc_str = in.strides();
nc_str.erase(nc_str.begin() + axis);
std::vector<size_t> in_nc_str = in.strides();
in_nc_str.erase(in_nc_str.begin() + axis);
std::vector<size_t> out_nc_str = out.strides();
out_nc_str.erase(out_nc_str.begin() + axis);
std::vector<int> nc_shape = in.shape();
nc_shape.erase(nc_shape.begin() + axis);
@@ -33,21 +36,28 @@ void single_block_sort(
int nc_dim = nc_shape.size();
int size_sorted_axis = in.shape(axis);
int stride_sorted_axis = in.strides()[axis];
int stride_segment_axis = *std::min_element(nc_str.begin(), nc_str.end());
int in_stride_sorted_axis = in.strides()[axis];
int out_stride_sorted_axis = out.strides()[axis];
int in_stride_segment_axis =
*std::min_element(in_nc_str.begin(), in_nc_str.end());
int out_stride_segment_axis =
*std::min_element(out_nc_str.begin(), out_nc_str.end());
// Check if remaining strides are contiguous
bool contiguous_write = true;
if (axis != in.ndim() - 1 && axis != 0) {
for (int i = 0; i < nc_str.size() - 1; ++i) {
size_t expected = nc_str[i + 1] * nc_str[i + 1];
contiguous_write &= (nc_str[i] == expected);
}
}
// We can only use the contiguous kernel if the sorted axis
// has the largest or smallest stride.
// We also need the input to be contiguous
bool contiguous = in.flags().contiguous;
auto check_strides = [](array x, int sort_stride) {
int min_stride = *std::min_element(x.strides().begin(), x.strides().end());
int max_stride = *std::max_element(x.strides().begin(), x.strides().end());
return sort_stride == min_stride || sort_stride == max_stride;
};
contiguous &= check_strides(in, in_stride_sorted_axis);
contiguous &= check_strides(out, out_stride_sorted_axis);
// Prepare kernel name
std::ostringstream kname;
kname << (contiguous_write ? "c" : "nc");
kname << (contiguous ? "c" : "nc");
if (argsort) {
kname << "arg";
}
@@ -64,14 +74,17 @@ void single_block_sort(
compute_encoder.set_input_array(in, 0);
compute_encoder.set_output_array(out, 1);
compute_encoder->setBytes(&size_sorted_axis, sizeof(int), 2);
compute_encoder->setBytes(&stride_sorted_axis, sizeof(int), 3);
compute_encoder->setBytes(&in_stride_sorted_axis, sizeof(int), 3);
compute_encoder->setBytes(&out_stride_sorted_axis, sizeof(int), 4);
if (contiguous_write) {
compute_encoder->setBytes(&stride_segment_axis, sizeof(int), 4);
if (contiguous) {
compute_encoder->setBytes(&in_stride_segment_axis, sizeof(int), 5);
compute_encoder->setBytes(&out_stride_segment_axis, sizeof(int), 6);
} else {
compute_encoder->setBytes(&nc_dim, sizeof(int), 4);
compute_encoder->setBytes(nc_shape.data(), nc_dim * sizeof(int), 5);
compute_encoder->setBytes(nc_str.data(), nc_dim * sizeof(size_t), 6);
compute_encoder->setBytes(&nc_dim, sizeof(int), 5);
compute_encoder->setBytes(nc_shape.data(), nc_dim * sizeof(int), 6);
compute_encoder->setBytes(in_nc_str.data(), nc_dim * sizeof(size_t), 7);
compute_encoder->setBytes(out_nc_str.data(), nc_dim * sizeof(size_t), 8);
}
MTL::Size group_dims = MTL::Size(bn, 1, 1);
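
Note: the check above replaces the old contiguous_write heuristic: the contiguous kernel is used only when the array is contiguous and the sorted axis has either the smallest or the largest stride, with the same test applied to both input and output. A standalone sketch of that predicate (example strides are illustrative):

#include <algorithm>
#include <iostream>
#include <vector>

// True when the sorted axis has the smallest or largest stride of the array.
bool sortable_as_contiguous(const std::vector<size_t>& strides, size_t sort_stride) {
  auto [mn, mx] = std::minmax_element(strides.begin(), strides.end());
  return sort_stride == *mn || sort_stride == *mx;
}

int main() {
  // Sorting axis 1 of a (4, 8, 2) row-major array: strides {16, 2, 1}.
  std::cout << std::boolalpha
            << sortable_as_contiguous({16, 2, 1}, /*sort_stride=*/2) << "\n"; // false
}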

View File

@@ -10,16 +10,16 @@ namespace mlx::core {
constexpr int MAX_TERNARY_SPECIALIZED_DIMS = 5;
void ternary_op(
void ternary_op_gpu_inplace(
const std::vector<array>& inputs,
array& out,
const std::string op) {
const std::string op,
const Stream& s) {
assert(inputs.size() == 3);
auto& a = inputs[0];
auto& b = inputs[1];
auto& c = inputs[2];
TernaryOpType topt = get_ternary_op_type(a, b, c);
set_ternary_op_output_data(a, b, c, out, topt, true /* donate_with_move */);
if (out.size() == 0) {
return;
@@ -47,10 +47,9 @@ void ternary_op(
kernel_name = kname.str();
}
auto& s = out.primitive().stream();
auto& d = metal::device(s.device);
auto kernel = get_ternary_kernel(d, kernel_name, out);
auto kernel = get_ternary_kernel(d, kernel_name, out.dtype(), op);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
@@ -101,8 +100,29 @@ void ternary_op(
}
}
void ternary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string op,
const Stream& s) {
auto& a = inputs[0];
auto& b = inputs[1];
auto& c = inputs[2];
TernaryOpType topt = get_ternary_op_type(a, b, c);
set_ternary_op_output_data(a, b, c, out, topt, true /* donate_with_move */);
ternary_op_gpu_inplace(inputs, out, op, s);
}
void ternary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string op) {
auto& s = out.primitive().stream();
ternary_op_gpu(inputs, out, op, s);
}
void Select::eval_gpu(const std::vector<array>& inputs, array& out) {
ternary_op(inputs, out, "select");
ternary_op_gpu(inputs, out, get_primitive_string(this));
}
} // namespace mlx::core

View File

@@ -0,0 +1,21 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/array.h"
namespace mlx::core {
void ternary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string op,
const Stream& s);
void ternary_op_gpu_inplace(
const std::vector<array>& inputs,
array& out,
const std::string op,
const Stream& s);
} // namespace mlx::core

View File

@@ -5,36 +5,28 @@
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#define UNARY_GPU(func) \
void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
unary_op_gpu(inputs, out, get_primitive_string(this)); \
}
namespace mlx::core {
void unary_op(
void unary_op_gpu_inplace(
const std::vector<array>& inputs,
array& out,
const std::string op) {
const std::string op,
const Stream& s) {
auto& in = inputs[0];
bool contig = in.flags().contiguous;
if (contig) {
if (in.is_donatable() && in.itemsize() == out.itemsize()) {
out.move_shared_buffer(in);
} else {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
}
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
if (in.size() == 0) {
return;
}
auto& s = out.primitive().stream();
auto& d = metal::device(s.device);
std::string kernel_name = (contig ? "v" : "g") + op + type_to_name(out);
auto kernel = get_unary_kernel(d, kernel_name, out);
auto kernel = get_unary_kernel(d, kernel_name, out.dtype(), op);
size_t nthreads = contig ? in.data_size() : in.size();
MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
@@ -59,148 +51,88 @@ void unary_op(
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
void Abs::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "abs");
}
void ArcCos::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "arccos");
}
void ArcCosh::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "arccosh");
}
void ArcSin::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "arcsin");
}
void ArcSinh::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "arcsinh");
}
void ArcTan::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "arctan");
}
void ArcTanh::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "arctanh");
}
void Conjugate::eval_gpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == complex64) {
unary_op(inputs, out, "conj");
void unary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string op,
const Stream& s) {
auto& in = inputs[0];
bool contig = in.flags().contiguous;
if (contig) {
if (in.is_donatable() && in.itemsize() == out.itemsize()) {
out.move_shared_buffer(in);
} else {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
}
} else {
throw std::invalid_argument(
"[conjugate] conjugate must be called on complex input.");
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
unary_op_gpu_inplace(inputs, out, op, s);
}
void Cos::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "cos");
void unary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string op) {
auto& s = out.primitive().stream();
unary_op_gpu(inputs, out, op, s);
}
void Cosh::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "cosh");
}
void Erf::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "erf");
}
void ErfInv::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "erfinv");
}
void Exp::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "exp");
}
void Expm1::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "expm1");
}
UNARY_GPU(Abs)
UNARY_GPU(ArcCos)
UNARY_GPU(ArcCosh)
UNARY_GPU(ArcSin)
UNARY_GPU(ArcSinh)
UNARY_GPU(ArcTan)
UNARY_GPU(ArcTanh)
UNARY_GPU(Conjugate)
UNARY_GPU(Cos)
UNARY_GPU(Cosh)
UNARY_GPU(Erf)
UNARY_GPU(ErfInv)
UNARY_GPU(Exp)
UNARY_GPU(Expm1)
UNARY_GPU(Log1p)
UNARY_GPU(LogicalNot)
UNARY_GPU(Floor)
UNARY_GPU(Ceil)
UNARY_GPU(Negative)
UNARY_GPU(Sigmoid)
UNARY_GPU(Sign)
UNARY_GPU(Sin)
UNARY_GPU(Sinh)
UNARY_GPU(Square)
UNARY_GPU(Sqrt)
UNARY_GPU(Tan)
UNARY_GPU(Tanh)
void Log::eval_gpu(const std::vector<array>& inputs, array& out) {
switch (base_) {
case Base::e:
unary_op(inputs, out, "log");
unary_op_gpu(inputs, out, get_primitive_string(this));
break;
case Base::two:
unary_op(inputs, out, "log2");
unary_op_gpu(inputs, out, get_primitive_string(this));
break;
case Base::ten:
unary_op(inputs, out, "log10");
unary_op_gpu(inputs, out, get_primitive_string(this));
break;
}
}
void Log1p::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "log1p");
}
void LogicalNot::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "lnot");
}
void Floor::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "floor");
}
void Ceil::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "ceil");
}
void Negative::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "neg");
}
void Round::eval_gpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(in.dtype(), inexact)) {
unary_op(inputs, out, "round");
unary_op_gpu(inputs, out, get_primitive_string(this));
} else {
// No-op integer types
out.copy_shared_buffer(in);
}
}
void Sigmoid::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "sigmoid");
}
void Sign::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "sign");
}
void Sin::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "sin");
}
void Sinh::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "sinh");
}
void Square::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "square");
}
void Sqrt::eval_gpu(const std::vector<array>& inputs, array& out) {
if (recip_) {
unary_op(inputs, out, "rsqrt");
} else {
unary_op(inputs, out, "sqrt");
}
}
void Tan::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "tan");
}
void Tanh::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "tanh");
}
} // namespace mlx::core
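
Note: the UNARY_GPU macro above collapses the long list of per-primitive eval_gpu definitions into one-liners that forward to unary_op_gpu with the primitive's name. A simplified standalone sketch of that pattern (here the macro also declares the class and stringizes the name instead of calling get_primitive_string, purely for illustration):

#include <iostream>
#include <string>
#include <vector>

struct array {};

void unary_op_gpu(const std::vector<array>&, array&, const std::string& op) {
  std::cout << "dispatch unary kernel for op: " << op << "\n";
}

// Stamps out a class whose eval_gpu forwards to the shared unary dispatcher.
#define UNARY_GPU(func)                                              \
  struct func {                                                      \
    void eval_gpu(const std::vector<array>& inputs, array& out) {    \
      unary_op_gpu(inputs, out, #func);                              \
    }                                                                \
  };

UNARY_GPU(Abs)
UNARY_GPU(Cos)

int main() {
  std::vector<array> ins(1);
  array out;
  Abs{}.eval_gpu(ins, out);  // dispatch unary kernel for op: Abs
  Cos{}.eval_gpu(ins, out);  // dispatch unary kernel for op: Cos
}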

Some files were not shown because too many files have changed in this diff.