Compare commits

...

47 Commits

Author SHA1 Message Date
Angelos Katharopoulos
3a1df968cf Add async all reduce and donation 2024-05-29 22:54:32 -07:00
Awni Hannun
9f9cb7a2ef version bump (#1154) 2024-05-23 18:08:08 -07:00
Awni Hannun
7e26fd8032 Option to JIT steel gemm / conv (#1139) 2024-05-23 18:07:34 -07:00
Jagrit Digani
eab2685c67 Float mask update (#1152)
* Float mask update

* Update CPU impl
2024-05-23 17:20:44 -07:00
Angelos Katharopoulos
50dfb664db Comms (#1097)
* Start the communications branch using MPI
* Add ops and primitives
* Add python bindings for distributed
2024-05-23 17:04:02 -07:00
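For reference, a minimal Python sketch of the new distributed API, assuming the bindings expose `is_available`, `init`, and an `all_sum` reduction mirroring the C++ `all_reduce_sum` shown later in this diff (exact names may differ by version):

```python
import mlx.core as mx

# Hypothetical usage of the new distributed bindings; run under mpirun.
if not mx.distributed.is_available():
    raise SystemExit("No communication backend found")

group = mx.distributed.init()
x = mx.ones((10,))
out = mx.distributed.all_sum(x)  # sum x across all ranks
print(group.rank(), "/", group.size(), out)
```

Launched with MPI, e.g. `mpirun -np 8 python script.py`, matching the CI invocation later in this diff.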
Awni Hannun
0189ab6ab6 More jitting (#1132)
* docs + circle min size build

* jit scan, arange, softmax

* add sort

* jit reductions

* remove print

* fix deps

* clean includes / nits
2024-05-23 16:23:44 -07:00
Rifur13
9401507336 Add groups to 2-D convolutions (#1129)
* Added groups to 2-D convolutions. Only implemented for **some** specializations.

Also fixed 1D grouped convs with different kernel strides and added more tests.

* fix channels condition
2024-05-22 20:01:44 -07:00
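For reference, a minimal sketch of the new `groups` argument, following the NHWC input and `(O, kH, kW, C // groups)` weight layout used in the benchmark changes below:

```python
import mlx.core as mx

N, H, W, C = 2, 16, 16, 32
O, kH, kW, groups = 32, 3, 3, 4

x = mx.random.normal((N, H, W, C))              # NHWC input
w = mx.random.normal((O, kH, kW, C // groups))  # each filter sees C // groups channels

y = mx.conv2d(x, w, stride=(1, 1), padding=(1, 1), groups=groups)
print(y.shape)  # (2, 16, 16, 32)
```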
Awni Hannun
eb8321d863 list based indexing (#1150) 2024-05-22 15:52:05 -07:00
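A quick sketch of what list-based indexing enables, assuming plain Python lists are now accepted wherever an index array was:

```python
import mlx.core as mx

a = mx.arange(10)
print(a[[1, 3, 5]])   # elements 1, 3, 5

m = mx.arange(9).reshape(3, 3)
print(m[[0, 2]])      # rows 0 and 2
```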
Abe Leininger
79ef49b2c2 add mx.trace (#1143) (#1147)
* working c++ trace implementation

* updated throw + added overloads

* added python binding for trace function

* pre-commit reformatting

* add trace to docs

* resolve comments

* remove to_stream call
2024-05-22 15:50:27 -07:00
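A minimal usage sketch of the new op (the `offset` keyword is assumed to follow the NumPy convention):

```python
import mlx.core as mx

a = mx.arange(9).reshape(3, 3)
print(mx.trace(a))            # 0 + 4 + 8 = 12
print(mx.trace(a, offset=1))  # 1 + 5 = 6 (first superdiagonal)
```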
Awni Hannun
e110ca11e2 Fix offset bug for device buffers (#1151)
* fix bug with large offsets for buffers

* add a test

* remove test as its too big for small machine
2024-05-22 15:50:05 -07:00
Awni Hannun
226748b3e7 JIT compile option for binary minimization (#1091)
* try cpp 20 for compile

* unary, binary, ternary in jit

* nits

* fix gather/scatter

* fix rebase

* reorg compile

* add ternary to compile

* jit copy

* jit compile flag

* fix build

* use linked function for ternary

* some nits

* docs + circle min size build

* docs + circle min size build

* fix extension

* fix no cpu build

* improve includes
2024-05-22 12:57:13 -07:00
Awni Hannun
d568c7ee36 Rename block sparse (#1149)
* block_sparse_mm to gather_mm

* rename

* nit

* nit
2024-05-22 07:48:34 -07:00
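A sketch of the renamed op as I understand its signature (`gather_mm(a, b, lhs_indices, rhs_indices)`; check the op docs for exact broadcasting rules):

```python
import mlx.core as mx

a = mx.random.normal((2, 8, 16))   # two input blocks
b = mx.random.normal((4, 16, 32))  # four candidate weight matrices

lhs_indices = mx.array([0, 1])     # keep a's batches in order
rhs_indices = mx.array([3, 0])     # block 0 uses weight 3, block 1 uses weight 0

out = mx.gather_mm(a, b, lhs_indices, rhs_indices)
print(out.shape)  # (2, 8, 32)
```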
Awni Hannun
e6fecbb3e1 Some fixes in docs (#1141)
* fixes in docs

* nit
2024-05-20 11:51:47 -07:00
Angelos Katharopoulos
da83f899bb Improve qvm speed (#1140) 2024-05-20 09:20:44 -07:00
jlwitthuhn
7e5674d8be Treat 'minimum' differently in cosine decay (#1138) 2024-05-20 08:00:48 -07:00
Shixian Sheng
0a558577bf Update README.md (#1136) 2024-05-20 06:16:40 -07:00
Awni Hannun
fb71a82ada Fix copy bug with many dims (#1137) 2024-05-17 21:10:03 -07:00
Awni Hannun
23406c9e9e Choose the right MLX bf16 for extensions (#1135)
* default to custom bf

* choose right bf

* fix extensions

* fix circle conf
2024-05-17 15:09:28 -07:00
Luca Arnaboldi
b3ec792380 Implemented Cholesky on CPU (#1119) 2024-05-17 12:31:59 -07:00
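A minimal sketch; since this change is CPU-only, the op is assumed to require a CPU stream like the other linalg ops:

```python
import mlx.core as mx

a = mx.array([[4.0, 2.0], [2.0, 3.0]])    # symmetric positive definite
L = mx.linalg.cholesky(a, stream=mx.cpu)  # lower-triangular factor
print(L @ L.T)                            # reconstructs a
```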
Awni Hannun
6a9b584f3d patch bump (#1131) 2024-05-16 20:51:33 -07:00
Awni Hannun
81dd33af66 allow conversion to dlpack (#1120) 2024-05-16 16:11:37 -07:00
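A sketch of the DLPack interop this enables, assuming NumPy >= 1.23 for `np.from_dlpack`:

```python
import mlx.core as mx
import numpy as np

a = mx.arange(4).astype(mx.float32)
b = np.from_dlpack(a)  # exported via the DLPack protocol
print(b)               # [0. 1. 2. 3.]
```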
Awni Hannun
8b76571896 Fix extensions (#1126)
* fix extensions

* title

* enable circle

* fix nanobind tag

* fix bug in doc

* try to fix config

* typo
2024-05-16 15:36:25 -07:00
Angelos Katharopoulos
e78a6518fa Block sparse qmm (#1124) 2024-05-16 15:24:14 -07:00
Awni Hannun
1873ffda01 Detect metal version and propagate correctly for JIT (#1109)
* detect metal version and propagate correctly for JIT

* remove softmax

* fix versions
2024-05-15 17:42:09 -07:00
Jacket
c417e42116 [Fix] minor typo in default argument for argpartition's "axis" parameter (#1125)
According to the documentation, argpartition's axis parameter can be None, but due to a previous typo it could not actually accept a None value.
2024-05-15 15:25:25 -07:00
Jagrit Digani
358e1fd6ab Fused GEMM (#1123)
* Basic gemm working

* Update addmm

* Clear out steel_gemm and steel_addmm kernels

* Fuse and clear out gather gemm

* Update objc releases
2024-05-15 10:30:41 -07:00
Awni Hannun
631dfbe673 fix scatter index bug (#1122) 2024-05-14 15:04:58 -07:00
Cheng
56a4eaed72 Pass missing stream arg in array.flatten (#1111) 2024-05-14 06:50:16 -07:00
Cheng
bf925d9dc7 Move args in conv_general (#1118)
Also fix a typo where padding_lo was passed as padding_hi.
2024-05-14 06:50:09 -07:00
Cheng
1a7ed5dcb6 Fill vector with constructor instead of fill_n (#1113) 2024-05-14 06:28:55 -07:00
Cheng
5be5daa6ef Use compiled function in Sigmoid module (#1116) 2024-05-14 06:25:57 -07:00
Cheng
60cb11764e Use correct module type in quantized.py (#1115) 2024-05-14 06:25:42 -07:00
Cheng
cbd5445ea7 The tile op does not accept None as reps (#1117) 2024-05-14 06:25:25 -07:00
Cheng
2c7e9b5158 Add missing docs for some ops (#1110) 2024-05-14 06:09:05 -07:00
Mike Drob
2263e4b279 Experiment with medium machines for CI (#1000) 2024-05-13 19:40:19 -07:00
Awni Hannun
863039da4c Allow scatter type exception to be caught by checking in op (#1077)
* allow exception to be caught in main thread

* only for gpu

* more detailed scatter error
2024-05-13 17:43:53 -07:00
Awni Hannun
7178ac0111 No CPU option for binary minimization (#1105)
* no cpu build option

* docs

* fix
2024-05-13 16:08:11 -07:00
Ravindra R. Jaju
e7f9710499 Fix typo in a variable name in example code. (#1104)
* Fix typo in a variable name in example code.

* Rename df2dx2 to d2fdx2 - the appropriate naming for the second derivative

* Update CONTRIBUTING.md - add needed python packages, and a virtual-env hint

* Revert "Fix typo in a variable name in example code."

This reverts commit bc10a17534.

* Rename df2dx2 to d2fdx2
2024-05-13 06:04:23 -07:00
Max-Heinrich Laves
ff4223904d Conv3d (#993)
* added conv3d

added conv3d

implemented explicit_gemm_conv_ND_cpu and bounds checks for slow_conv_3D

* incorporated reviewer comments

* fixed test

* reduced tensor shapes in test for conv3d

* Reviewer suggestion

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

Reviewer suggestion

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

Reviewer suggestion

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

Reviewer suggestion
2024-05-11 06:15:02 -07:00
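A minimal sketch of the new layer, assuming the channels-last (NDHWC) layout used by the other conv layers:

```python
import mlx.core as mx
import mlx.nn as nn

x = mx.random.normal((2, 8, 8, 8, 4))  # NDHWC input
conv = nn.Conv3d(in_channels=4, out_channels=16, kernel_size=3, padding=1)
y = conv(x)
print(y.shape)  # (2, 8, 8, 8, 16)
```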
Awni Hannun
a9f80d60f6 improve error messaging in eval (#1101) 2024-05-10 10:04:07 -07:00
Alex Barron
2e158cf6d0 Add conjugate operator (#1100)
* cpu and gpu impl

* add mx.conj and array.conj()

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-05-10 07:22:20 -07:00
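A minimal sketch of the two entry points added here:

```python
import mlx.core as mx

z = mx.array([1 + 2j, 3 - 4j])  # complex64
print(mx.conj(z))  # [1-2j, 3+4j]
print(z.conj())    # method form added in the same change
```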
Awni Hannun
8bd6bfa4b5 version (#1099) 2024-05-09 17:52:39 -07:00
Awni Hannun
8b1906abd0 Add compiler flags to disable safetensors and gguf (#1098)
* with docs

* nit
2024-05-09 17:39:44 -07:00
Awni Hannun
06375e6605 Split encoders in non-concurrent context with a max ops per encoder (#1085)
* split encoders

* fix race
2024-05-09 16:21:02 -07:00
Awni Hannun
b21242faf1 Allow unary ops to accept array like (#1093) 2024-05-09 09:36:02 -07:00
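A quick sketch of what array-like inputs to unary ops look like, assuming scalars and nested lists are converted on the fly:

```python
import mlx.core as mx

print(mx.exp(1.0))          # scalar input
print(mx.abs([-1, 2, -3]))  # list input
```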
Rahul Yedida
cc05a281c4 Added ArcTan2 operation (#1079)
* Added ArcTan2 operation

* Cleanup, bug fixes from code review

* Minor cleanup, fixed Linux tests
2024-05-08 08:35:15 -07:00
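A minimal usage sketch (two-argument arctangent, assumed to match NumPy semantics):

```python
import mlx.core as mx

y = mx.array([1.0, -1.0])
x = mx.array([1.0, 1.0])
print(mx.arctan2(y, x))  # [pi/4, -pi/4]
```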
Jagrit Digani
fe96ceee66 Update block offset adjustment to be in size_t (#1087) 2024-05-08 08:10:23 -07:00
200 changed files with 15127 additions and 7628 deletions

View File

@@ -49,11 +49,6 @@ jobs:
name: Run Python tests
command: |
python3 -m unittest discover python/tests -v
# TODO: Reenable when extension api becomes stable
# - run:
# name: Build example extension
# command: |
# cd examples/extensions && python3 -m pip install .
- run:
name: Build CPP only
command: |
@@ -69,13 +64,14 @@ jobs:
default: "15.2.0"
macos:
xcode: << parameters.xcode_version >>
resource_class: macos.m1.large.gen1
resource_class: macos.m1.medium.gen1
steps:
- checkout
- run:
name: Install dependencies
command: |
brew install python@3.8
brew install openmpi
python3.8 -m venv env
source env/bin/activate
pip install --upgrade pip
@@ -101,11 +97,14 @@ jobs:
source env/bin/activate
LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
# TODO: Reenable when extension api becomes stable
# - run:
# name: Build example extension
# command: |
# cd examples/extensions && python3.11 -m pip install .
mpirun -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
- run:
name: Build example extension
command: |
source env/bin/activate
cd examples/extensions
pip install -r requirements.txt
python setup.py build_ext -j8
- store_test_results:
path: test-results
- run:
@@ -117,7 +116,13 @@ jobs:
name: Run CPP tests
command: |
DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
DEVICE=cpu ./build/tests/tests
- run:
name: Build small binary
command: |
source env/bin/activate
cd build/
cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel -DBUILD_SHARED_LIBS=ON -DMLX_BUILD_CPU=OFF -DMLX_BUILD_SAFETENSORS=OFF -DMLX_BUILD_GGUF=OFF -DMLX_METAL_JIT=ON
make -j
build_release:
parameters:
@@ -132,7 +137,7 @@ jobs:
default: ""
macos:
xcode: << parameters.xcode_version >>
resource_class: macos.m1.large.gen1
resource_class: macos.m1.medium.gen1
steps:
- checkout
- run:

View File

@@ -16,6 +16,7 @@ MLX was developed with contributions from the following individuals:
- Luca Arnaboldi: Added `Ceil` and `Floor` ops; implemented pickling, copy and deepcopy for mlx arrays.
- Brian Keene & Atila Orhon, with Argmax Inc.: Added `fast.scaled_dot_product_attention`
- AmirHossein Razlighi: Added chaining support for some of the ops in `nn.Module`. Comparison works for non array objects in `mlx.core.array`. Exception handling for invalid operations in `mlx.core.array`.
- Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
<a href="https://github.com/ml-explore/mlx/graphs/contributors">
<img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />

View File

@@ -15,12 +15,16 @@ option(MLX_BUILD_EXAMPLES "Build examples for mlx" ON)
option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
option(MLX_BUILD_METAL "Build metal backend" ON)
option(MLX_BUILD_CPU "Build cpu backend" ON)
option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
if(NOT MLX_VERSION)
set(MLX_VERSION 0.12.2)
set(MLX_VERSION 0.14.0)
endif()
# --------------------- Processor tests -------------------------
@@ -84,9 +88,11 @@ elseif (MLX_BUILD_METAL)
if (${MACOS_VERSION} GREATER_EQUAL 14.2)
set(METAL_CPP_PATCH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/metal.14.2.diff)
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14.2_iOS17.2.zip)
set(MLX_METAL_VERSION METAL_3_1)
elseif (${MACOS_VERSION} GREATER_EQUAL 14.0)
set(METAL_CPP_PATCH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/metal.14.0.diff)
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14_iOS17-beta.zip)
set(MLX_METAL_VERSION METAL_3_0)
else()
message(FATAL_ERROR "MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON" )
endif()
@@ -104,55 +110,66 @@ elseif (MLX_BUILD_METAL)
$<INSTALL_INTERFACE:include/metal_cpp>
)
target_link_libraries(
mlx
mlx PUBLIC
${METAL_LIB}
${FOUNDATION_LIB}
${QUARTZ_LIB})
add_compile_definitions(${MLX_METAL_VERSION})
endif()
find_library(ACCELERATE_LIBRARY Accelerate)
if (MLX_BUILD_ARM AND ACCELERATE_LIBRARY)
message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
set(MLX_BUILD_ACCELERATE ON)
target_link_libraries(mlx ${ACCELERATE_LIBRARY})
add_compile_definitions(ACCELERATE_NEW_LAPACK)
if (MLX_BUILD_CPU)
find_library(ACCELERATE_LIBRARY Accelerate)
if (MLX_BUILD_ARM AND ACCELERATE_LIBRARY)
message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
set(MLX_BUILD_ACCELERATE ON)
target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
add_compile_definitions(ACCELERATE_NEW_LAPACK)
else()
message(STATUS "Accelerate or arm neon not found, using default backend.")
set(MLX_BUILD_ACCELERATE OFF)
if(${CMAKE_HOST_APPLE})
# The blas shipped in macOS SDK is not supported, search homebrew for
# openblas instead.
set(BLA_VENDOR OpenBLAS)
set(LAPACK_ROOT "${LAPACK_ROOT};$ENV{LAPACK_ROOT};/usr/local/opt/openblas")
endif()
# Search and link with lapack.
find_package(LAPACK REQUIRED)
if (NOT LAPACK_FOUND)
message(FATAL_ERROR "Must have LAPACK installed")
endif()
find_path(LAPACK_INCLUDE_DIRS lapacke.h
/usr/include
/usr/local/include
/usr/local/opt/openblas/include)
message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
target_link_libraries(mlx PUBLIC ${LAPACK_LIBRARIES})
# List blas after lapack otherwise we may accidentally include an old version
# of lapack.h from the include dirs of blas.
find_package(BLAS REQUIRED)
if (NOT BLAS_FOUND)
message(FATAL_ERROR "Must have BLAS installed")
endif()
# TODO find a cleaner way to do this
find_path(BLAS_INCLUDE_DIRS cblas.h
/usr/include
/usr/local/include
$ENV{BLAS_HOME}/include)
message(STATUS "Blas lib " ${BLAS_LIBRARIES})
message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
target_link_libraries(mlx PUBLIC ${BLAS_LIBRARIES})
endif()
else()
message(STATUS "Accelerate or arm neon not found, using default backend.")
set(MLX_BUILD_ACCELERATE OFF)
if(${CMAKE_HOST_APPLE})
# The blas shipped in macOS SDK is not supported, search homebrew for
# openblas instead.
set(BLA_VENDOR OpenBLAS)
set(LAPACK_ROOT "${LAPACK_ROOT};$ENV{LAPACK_ROOT};/usr/local/opt/openblas")
endif()
# Search and link with lapack.
find_package(LAPACK REQUIRED)
if (NOT LAPACK_FOUND)
message(FATAL_ERROR "Must have LAPACK installed")
endif()
find_path(LAPACK_INCLUDE_DIRS lapacke.h
/usr/include
/usr/local/include
/usr/local/opt/openblas/include)
message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
target_link_libraries(mlx ${LAPACK_LIBRARIES})
# List blas after lapack otherwise we may accidentally include an old version
# of lapack.h from the include dirs of blas.
find_package(BLAS REQUIRED)
if (NOT BLAS_FOUND)
message(FATAL_ERROR "Must have BLAS installed")
endif()
# TODO find a cleaner way to do this
find_path(BLAS_INCLUDE_DIRS cblas.h
/usr/include
/usr/local/include
$ENV{BLAS_HOME}/include)
message(STATUS "Blas lib " ${BLAS_LIBRARIES})
message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
target_link_libraries(mlx ${BLAS_LIBRARIES})
endif()
find_package(MPI)
if (MPI_FOUND)
target_include_directories(mlx PRIVATE ${MPI_INCLUDE_PATH})
endif()
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)
@@ -164,6 +181,14 @@ target_include_directories(
$<INSTALL_INTERFACE:include>
)
FetchContent_Declare(fmt
GIT_REPOSITORY https://github.com/fmtlib/fmt.git
GIT_TAG 10.2.1
EXCLUDE_FROM_ALL
)
FetchContent_MakeAvailable(fmt)
target_link_libraries(mlx PRIVATE fmt::fmt-header-only)
if (MLX_BUILD_PYTHON_BINDINGS)
message(STATUS "Building Python bindings.")
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)

View File

@@ -88,13 +88,13 @@ for more information on building the C++ and Python APIs from source.
## Contributing
Check out the [contribution guidelines](CONTRIBUTING.md) for more information
Check out the [contribution guidelines](https://github.com/ml-explore/mlx/tree/main/CONTRIBUTING.md) for more information
on contributing to MLX. See the
[docs](https://ml-explore.github.io/mlx/build/html/install.html) for more
information on building from source, and running tests.
We are grateful for all of [our
contributors](ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
contributors](https://github.com/ml-explore/mlx/tree/main/ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
to MLX and wish to be acknowledged, please add your name to the list in your
pull request.

View File

@@ -28,11 +28,11 @@ def bench(f, a, b):
return (e - s) * 1e-9
def make_mx_conv_2D(strides=(1, 1), padding=(0, 0)):
def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
def mx_conv_2D(a, b):
ys = []
for i in range(N_iter_func):
y = mx.conv2d(a, b, stride=strides, padding=padding)
y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
mx.eval(ys)
return ys
@@ -40,12 +40,12 @@ def make_mx_conv_2D(strides=(1, 1), padding=(0, 0)):
return mx_conv_2D
def make_pt_conv_2D(strides=(1, 1), padding=(0, 0)):
def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
@torch.no_grad()
def pt_conv_2D(a, b):
ys = []
for i in range(N_iter_func):
y = torch.conv2d(a, b, stride=strides, padding=padding)
y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
torch.mps.synchronize()
return ys
@@ -53,11 +53,13 @@ def make_pt_conv_2D(strides=(1, 1), padding=(0, 0)):
return pt_conv_2D
def bench_shape(N, H, W, C, kH, kW, O, strides, padding, np_dtype):
def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
scale = 1.0 / math.sqrt(kH * kH * C)
a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, kH, kW, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
np_dtype
)
a_mx = mx.array(a_np)
b_mx = mx.array(b_np)
@@ -67,15 +69,15 @@ def bench_shape(N, H, W, C, kH, kW, O, strides, padding, np_dtype):
torch.mps.synchronize()
f_mx = make_mx_conv_2D(strides, padding)
f_pt = make_pt_conv_2D(strides, padding)
f_mx = make_mx_conv_2D(strides, padding, groups)
f_pt = make_pt_conv_2D(strides, padding, groups)
time_torch = bench(f_pt, a_pt, b_pt)
time_mlx = bench(f_mx, a_mx, b_mx)
out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding)
out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
out_pt = torch.conv2d(
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
)
out_pt = torch.permute(out_pt, (0, 2, 3, 1))
out_pt = out_pt.numpy(force=True)
@@ -84,7 +86,7 @@ def bench_shape(N, H, W, C, kH, kW, O, strides, padding, np_dtype):
if not np.allclose(out_pt, out_mx, atol=atol):
print(
f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
)
return time_mlx, time_torch
@@ -95,35 +97,40 @@ if __name__ == "__main__":
dtypes = ("float32",)
shapes = (
(4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2)),
(4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2)),
(4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2)),
(4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2)),
(4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2)),
(4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2)),
(4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2)),
(4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2)),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2)),
(4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2)),
(4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2)),
(4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2)),
(4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2)),
(4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2)),
(4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2)),
(4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2)),
(4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
(4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
(4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
(4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
)
for dtype in dtypes:
print("(N, H, W, C), ( O, kH, kW, C), dtype, stride, pads, diff%")
for N, H, W, C, kH, kW, O, strides, padding in shapes:
print(
"(N, H, W, C), ( O, kH, kW, C), dtype, stride, pads, groups, diff%"
)
for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
np_dtype = getattr(np, dtype)
time_mlx, time_torch = bench_shape(
N, H, W, C, kH, kW, O, strides, padding, np_dtype
N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
)
diff = time_torch / time_mlx - 1.0
print(
f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {100. * diff:+5.2f}%"
f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
)
if time_mlx >= 2.0 * time_torch:
print("ATTENTION ^^^^^^^")

View File

@@ -1,5 +1,5 @@
Developer Documentation
=======================
Custom Extensions in MLX
========================
You can extend MLX with custom operations on the CPU or GPU. This guide
explains how to do that with a simple example.
@@ -494,7 +494,7 @@ below.
auto kernel = d.get_kernel(kname.str(), "mlx_ext");
// Prepare to encode kernel
auto compute_encoder = d.get_command_encoder(s.index);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
// Kernel parameters are registered with buffer indices corresponding to
@@ -503,11 +503,11 @@ below.
size_t nelem = out.size();
// Encode input arrays to kernel
set_array_buffer(compute_encoder, x, 0);
set_array_buffer(compute_encoder, y, 1);
compute_encoder.set_input_array(x, 0);
compute_encoder.set_input_array(y, 1);
// Encode output arrays to kernel
set_array_buffer(compute_encoder, out, 2);
compute_encoder.set_output_array(out, 2);
// Encode alpha and beta
compute_encoder->setBytes(&alpha_, sizeof(float), 3);
@@ -531,7 +531,7 @@ below.
// Launch the grid with the given number of threads divided among
// the given threadgroups
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
We can now call the :meth:`axpby` operation on both the CPU and the GPU!
@@ -825,7 +825,7 @@ Let's look at a simple script and its results:
print(f"c shape: {c.shape}")
print(f"c dtype: {c.dtype}")
print(f"c correctness: {mx.all(c == 6.0).item()}")
print(f"c correct: {mx.all(c == 6.0).item()}")
Output:

View File

@@ -153,11 +153,18 @@ should point to the path to the built metal library.
- OFF
* - MLX_BUILD_METAL
- ON
* - MLX_BUILD_CPU
- ON
* - MLX_BUILD_PYTHON_BINDINGS
- OFF
* - MLX_METAL_DEBUG
- OFF
* - MLX_BUILD_SAFETENSORS
- ON
* - MLX_BUILD_GGUF
- ON
* - MLX_METAL_JIT
- OFF
.. note::
@@ -176,10 +183,37 @@ should point to the path to the built metal library.
xcrun -sdk macosx --show-sdk-version
Binary Size Minimization
~~~~~~~~~~~~~~~~~~~~~~~~
To produce a smaller binary use the CMake flags `CMAKE_BUILD_TYPE=MinSizeRel`
and `BUILD_SHARED_LIBS=ON`.
The MLX CMake build has several additional options to make smaller binaries.
For example, if you don't need the CPU backend or support for safetensors and
GGUF, you can do:
.. code-block:: shell
cmake ..
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DBUILD_SHARED_LIBS=ON \
-DMLX_BUILD_CPU=OFF \
-DMLX_BUILD_SAFETENSORS=OFF \
-DMLX_BUILD_GGUF=OFF \
-DMLX_METAL_JIT=ON
The `MLX_METAL_JIT` flag minimizes the size of the MLX Metal library, which
contains pre-built GPU kernels, by compiling kernels at run time the first time
they are used in MLX on a given machine. Note that run-time compilation incurs
a cold-start cost which can be anywhere from a few hundred milliseconds to a
few seconds depending on the application. Once a kernel is compiled, it is
cached by the system, and the Metal kernel cache persists across reboots.
Troubleshooting
^^^^^^^^^^^^^^^
Metal not found
~~~~~~~~~~~~~~~

View File

@@ -8,5 +8,8 @@ Linear Algebra
.. autosummary::
:toctree: _autosummary
inv
norm
cholesky
qr
svd

View File

@@ -15,6 +15,7 @@ Layers
BatchNorm
Conv1d
Conv2d
Conv3d
Dropout
Dropout2d
Dropout3d

View File

@@ -10,6 +10,7 @@ Operations
abs
add
addmm
all
allclose
any
@@ -19,12 +20,14 @@ Operations
arcsin
arcsinh
arctan
arctan2
arctanh
argmax
argmin
argpartition
argsort
array_equal
as_strided
atleast_1d
atleast_2d
atleast_3d
@@ -32,11 +35,12 @@ Operations
bitwise_or
bitwise_xor
block_masked_mm
block_sparse_mm
broadcast_to
ceil
clip
concatenate
conj
conjugate
convolve
conv1d
conv2d
@@ -64,6 +68,8 @@ Operations
floor
floor_divide
full
gather_mm
gather_qmm
greater
greater_equal
identity
@@ -73,6 +79,7 @@ Operations
isnan
isneginf
isposinf
issubdtype
left_shift
less
less_equal
@@ -103,11 +110,13 @@ Operations
outer
partition
pad
power
prod
quantize
quantized_matmul
radians
reciprocal
remainder
repeat
reshape
right_shift
@@ -141,6 +150,7 @@ Operations
tensordot
tile
topk
trace
transpose
tri
tril

View File

@@ -9,3 +9,4 @@ build_example(tutorial.cpp)
build_example(linear_regression.cpp)
build_example(logistic_regression.cpp)
build_example(metal_capture.cpp)
build_example(distributed.cpp)

View File

@@ -0,0 +1,22 @@
// Copyright © 2024 Apple Inc.
#include <iostream>
#include "mlx/mlx.h"
using namespace mlx::core;
int main() {
if (!distributed::is_available()) {
std::cout << "No communication backend found" << std::endl;
return 1;
}
auto global_group = distributed::init();
std::cout << global_group.rank() << " / " << global_group.size() << std::endl;
array x = ones({10});
array out = distributed::all_reduce_sum(x, global_group);
std::cout << out << std::endl;
}

View File

@@ -89,8 +89,8 @@ void automatic_differentiation() {
// dfdx is 2 * x
// Get the second derivative by composing grad with grad
auto df2dx2 = grad(grad(fn))(x);
// df2dx2 is 2
auto d2fdx2 = grad(grad(fn))(x);
// d2fdx2 is 2
}
int main() {

View File

@@ -1,5 +1,5 @@
## Build the extensions
## Build
```
pip install -e .
@@ -16,3 +16,9 @@ And then run:
```
python setup.py build_ext -j8 --inplace
```
## Test
```
python test.py
```

View File

@@ -257,7 +257,7 @@ void Axpby::eval_gpu(
auto kernel = d.get_kernel(kname.str(), "mlx_ext");
// Prepare to encode kernel
auto compute_encoder = d.get_command_encoder(s.index);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
// Kernel parameters are registered with buffer indices corresponding to
@@ -266,11 +266,11 @@ void Axpby::eval_gpu(
size_t nelem = out.size();
// Encode input arrays to kernel
set_array_buffer(compute_encoder, x, 0);
set_array_buffer(compute_encoder, y, 1);
compute_encoder.set_input_array(x, 0);
compute_encoder.set_input_array(y, 1);
// Encode output arrays to kernel
set_array_buffer(compute_encoder, out, 2);
compute_encoder.set_output_array(out, 2);
// Encode alpha and beta
compute_encoder->setBytes(&alpha_, sizeof(float), 3);
@@ -296,7 +296,7 @@ void Axpby::eval_gpu(
// Launch the grid with the given number of threads divided among
// the given threadgroups
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
#else // Metal is not available

View File

@@ -2,4 +2,4 @@
import mlx.core as mx
from .mlx_sample_extensions import *
from ._ext import axpby

View File

@@ -1,4 +1,4 @@
setuptools>=42
cmake>=3.24
mlx>=0.9.0
nanobind@git+https://github.com/wjakob/nanobind.git#egg=4148debcf91f5ccab0c3b8d67b5c3cabd61f407f
nanobind@git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4

View File

@@ -0,0 +1,10 @@
import mlx.core as mx
from mlx_sample_extensions import axpby
a = mx.ones((3, 4))
b = mx.ones((3, 4))
c = axpby(a, b, 4.0, 2.0, stream=mx.cpu)
print(f"c shape: {c.shape}")
print(f"c dtype: {c.dtype}")
print(f"c correct: {mx.all(c == 6.0).item()}")

View File

@@ -19,11 +19,17 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h
)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
if (MLX_BUILD_CPU)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
else()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_cpu)
endif()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/distributed)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
if (MLX_BUILD_ACCELERATE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
else()
elseif(MLX_BUILD_CPU)
target_sources(
mlx
PRIVATE

View File

@@ -252,8 +252,9 @@ class array {
}
/** True indicates the arrays buffer is safe to reuse */
bool is_donatable() const {
return array_desc_.use_count() == 1 && (array_desc_->data.use_count() == 1);
bool is_donatable(int known_instances = 1) const {
return array_desc_.use_count() == known_instances &&
(array_desc_->data.use_count() == 1);
}
/** The array's siblings. */

View File

@@ -32,10 +32,10 @@ DEFAULT(ArgReduce)
DEFAULT(ArgSort)
DEFAULT(AsStrided)
DEFAULT(BlockMaskedMM)
DEFAULT(BlockSparseMM)
DEFAULT(Broadcast)
DEFAULT(Ceil)
DEFAULT(Concatenate)
DEFAULT(Conjugate)
DEFAULT(Copy)
DEFAULT_MULTI(CustomVJP)
DEFAULT_MULTI(Depends)
@@ -47,6 +47,8 @@ DEFAULT(ErfInv)
DEFAULT(FFT)
DEFAULT(Floor)
DEFAULT(Gather)
DEFAULT(GatherMM)
DEFAULT(GatherQMM)
DEFAULT(Greater)
DEFAULT(GreaterEqual)
DEFAULT(Less)
@@ -78,6 +80,7 @@ DEFAULT(StopGradient)
DEFAULT_MULTI(SVD)
DEFAULT(Transpose)
DEFAULT(Inverse)
DEFAULT(Cholesky)
void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
@@ -193,6 +196,26 @@ void ArcTan::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (out.dtype() == float32 && a.flags().row_contiguous &&
b.flags().row_contiguous) {
if (a.is_donatable()) {
out.copy_shared_buffer(a);
} else if (b.is_donatable()) {
out.copy_shared_buffer(b);
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
int size = a.data_size();
vvatan2f(out.data<float>(), a.data<float>(), b.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void ArcTanh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];

View File

@@ -37,6 +37,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
${CMAKE_CURRENT_SOURCE_DIR}/erf.cpp
@@ -55,6 +56,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
)

View File

@@ -293,4 +293,25 @@ void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void ArcTan2::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
const auto& a = inputs[0];
const auto& b = inputs[1];
if (out.dtype() == float32) {
binary_op<float>(a, b, out, detail::ArcTan2());
} else if (out.dtype() == float16) {
binary_op<float16_t>(a, b, out, detail::ArcTan2());
} else if (out.dtype() == bfloat16) {
binary_op<bfloat16_t>(a, b, out, detail::ArcTan2());
} else if (issubdtype(out.dtype(), inexact)) {
std::ostringstream err;
err << "[arctan2] Does not support " << out.dtype();
throw std::invalid_argument(err.str());
} else {
throw std::invalid_argument(
"[arctan2] Cannot compute inverse tangent for arrays"
" with non floating point type.");
}
}
} // namespace mlx::core

View File

@@ -1,6 +1,8 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include <cassert>
#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/utils.h"

View File

@@ -0,0 +1,101 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h"
#ifdef ACCELERATE_NEW_LAPACK
#include <Accelerate/Accelerate.h>
#else
#include <lapack.h>
#endif
namespace mlx::core {
namespace {
// Delegate to the Cholesky factorization, taking into account differences in
// LAPACK implementations (basically how the 'uplo' string is passed to Fortran).
int spotrf_wrapper(char uplo, float* matrix, int N) {
int info;
#ifdef LAPACK_FORTRAN_STRLEN_END
spotrf_(
/* uplo = */ &uplo,
/* n = */ &N,
/* a = */ matrix,
/* lda = */ &N,
/* info = */ &info,
/* uplo_len = */ static_cast<size_t>(1));
#else
spotrf_(
/* uplo = */ &uplo,
/* n = */ &N,
/* a = */ matrix,
/* lda = */ &N,
/* info = */ &info);
#endif
return info;
}
} // namespace
void cholesky_impl(const array& a, array& factor, bool upper) {
// Lapack uses the column-major convention. We take advantage of the fact that
// the matrix should be symmetric:
// (A)ᵀ = A
// and that a column-major lower triangular matrix is a row-major upper
// triangular matrix, so uplo is the opposite of what we would expect from
// upper
char uplo = (upper) ? 'L' : 'U';
// The decomposition is computed in place, so just copy the input to the
// output.
copy(
a,
factor,
a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
const int N = a.shape(-1);
const size_t num_matrices = a.size() / (N * N);
float* matrix = factor.data<float>();
for (int i = 0; i < num_matrices; i++) {
// Compute Cholesky factorization.
int info = spotrf_wrapper(uplo, matrix, N);
// TODO: We do nothing when the matrix is not positive semi-definite
// because throwing an error would result in a crash. If we figure out how
// to catch errors from the implementation we should throw.
if (info < 0) {
std::stringstream msg;
msg << "[cholesky] Cholesky decomposition failed with error code "
<< info;
throw std::runtime_error(msg.str());
}
// Zero out the upper/lower triangle while advancing the pointer to the
// next matrix at the same time.
for (int row = 0; row < N; row++) {
if (upper) {
std::fill(matrix, matrix + row, 0);
} else {
std::fill(matrix + row + 1, matrix + N, 0);
}
matrix += N;
}
}
}
void Cholesky::eval(const std::vector<array>& inputs, array& output) {
if (inputs[0].dtype() != float32) {
throw std::runtime_error("[Cholesky::eval] only supports float32.");
}
cholesky_impl(inputs[0], output, upper_);
}
} // namespace mlx::core

View File

@@ -0,0 +1,347 @@
// Copyright © 2024 Apple Inc.
#include <cassert>
#include "mlx/backend/common/utils.h"
#include "mlx/primitives.h"
namespace mlx::core {
void AsStrided::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (!in.flags().row_contiguous) {
// Just ensuring that inputs[0] came from the ops which would ensure the
// input is row contiguous.
throw std::runtime_error(
"AsStrided must be used with row contiguous arrays only.");
}
// Compute the flags given the shape and strides
bool row_contiguous = true, col_contiguous = true;
size_t r = 1, c = 1;
for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
r *= shape_[i];
c *= shape_[j];
}
auto flags = in.flags();
// TODO: Compute the contiguous flag in a better way cause now we are
// unnecessarily strict.
flags.contiguous = row_contiguous || col_contiguous;
flags.row_contiguous = row_contiguous;
flags.col_contiguous = col_contiguous;
// There is no easy way to compute the actual data size so we use out.size().
// The contiguous flag will almost certainly not be set so no code should
// rely on data_size anyway.
size_t data_size = out.size();
return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
}
void Broadcast::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.size() == 0) {
out.set_data(nullptr);
return;
}
std::vector<size_t> strides(out.ndim(), 0);
int diff = out.ndim() - in.ndim();
for (int i = in.ndim() - 1; i >= 0; --i) {
strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
}
auto flags = in.flags();
if (out.size() > in.size()) {
flags.row_contiguous = flags.col_contiguous = false;
}
out.copy_shared_buffer(in, strides, flags, in.data_size());
}
void Copy::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.copy_shared_buffer(inputs[0]);
}
void CustomVJP::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
i++, j++) {
outputs[i].copy_shared_buffer(inputs[j]);
}
}
void Depends::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0; i < outputs.size(); i++) {
outputs[i].copy_shared_buffer(inputs[i]);
}
}
void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
double numel = 1;
for (auto ax : axes_) {
numel *= inputs[0].shape(ax);
}
if (inverted_) {
numel = 1.0 / numel;
}
switch (out.dtype()) {
case bool_:
*out.data<bool>() = static_cast<bool>(numel);
break;
case uint8:
*out.data<uint8_t>() = static_cast<uint8_t>(numel);
break;
case uint16:
*out.data<uint16_t>() = static_cast<uint16_t>(numel);
break;
case uint32:
*out.data<uint32_t>() = static_cast<uint32_t>(numel);
break;
case uint64:
*out.data<uint64_t>() = static_cast<uint64_t>(numel);
break;
case int8:
*out.data<int8_t>() = static_cast<int8_t>(numel);
break;
case int16:
*out.data<int16_t>() = static_cast<int16_t>(numel);
break;
case int32:
*out.data<int32_t>() = static_cast<int32_t>(numel);
break;
case int64:
*out.data<int64_t>() = static_cast<int64_t>(numel);
break;
case float16:
*out.data<float16_t>() = static_cast<float16_t>(numel);
break;
case float32:
*out.data<float>() = static_cast<float>(numel);
break;
case bfloat16:
*out.data<bfloat16_t>() = static_cast<bfloat16_t>(numel);
break;
case complex64:
*out.data<complex64_t>() = static_cast<complex64_t>(numel);
break;
}
}
std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
const array& in,
const array& out) {
// Special case for empty arrays or row contiguous arrays
if (in.size() == 0 || in.flags().row_contiguous) {
return {false, out.strides()};
}
// Special case for scalars
if (in.ndim() == 0) {
std::vector<size_t> out_strides(out.ndim(), 0);
return {false, out_strides};
}
// Firstly let's collapse all the contiguous dimensions of the input
auto [shape, _strides] = collapse_contiguous_dims(in);
auto& strides = _strides[0];
// If shapes fit exactly in the contiguous dims then no copy is necessary so
// let's check.
std::vector<size_t> out_strides;
bool copy_necessary = false;
int j = 0;
for (int i = 0; i < out.ndim(); i++) {
int N = out.shape(i);
if (j < shape.size() && shape[j] % N == 0) {
shape[j] /= N;
out_strides.push_back(shape[j] * strides[j]);
j += (shape[j] == 1);
} else if (N == 1) {
// i > 0 because otherwise j < shape.size() && shape[j] % 1 == 0
out_strides.push_back(out_strides.back());
} else {
copy_necessary = true;
break;
}
}
return {copy_necessary, out_strides};
}
void Reshape::shared_buffer_reshape(
const array& in,
const std::vector<size_t>& out_strides,
array& out) {
auto flags = in.flags();
if (flags.row_contiguous) {
// For row contiguous reshapes:
// - Shallow copy the buffer
// - If reshaping into a vector (all singleton dimensions except one) it
// becomes col contiguous again.
auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
}
out.copy_shared_buffer(in, out_strides, flags, in.data_size());
}
void Split::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 1);
auto& in = inputs[0];
auto compute_new_flags = [](const auto& shape,
const auto& strides,
size_t in_data_size,
auto flags) {
size_t data_size = 1;
size_t f_stride = 1;
size_t b_stride = 1;
flags.row_contiguous = true;
flags.col_contiguous = true;
for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
flags.col_contiguous &= strides[i] == f_stride || shape[i] == 1;
flags.row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
f_stride *= shape[i];
b_stride *= shape[ri];
if (strides[i] > 0) {
data_size *= shape[i];
}
}
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in_data_size) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
return std::pair<decltype(flags), size_t>{flags, data_size};
};
std::vector<int> indices(1, 0);
indices.insert(indices.end(), indices_.begin(), indices_.end());
for (int i = 0; i < indices.size(); i++) {
size_t offset = indices[i] * in.strides()[axis_];
auto [new_flags, data_size] = compute_new_flags(
outputs[i].shape(), in.strides(), in.data_size(), in.flags());
outputs[i].copy_shared_buffer(
in, in.strides(), new_flags, data_size, offset);
}
}
std::tuple<bool, int64_t, std::vector<int64_t>> Slice::prepare_slice(
const array& in) {
int64_t data_offset = 0;
bool copy_needed = false;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices_[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides_[i];
copy_needed |= strides_[i] < 0;
}
return std::make_tuple(copy_needed, data_offset, inp_strides);
}
void Slice::shared_buffer_slice(
const array& in,
const std::vector<size_t>& out_strides,
size_t data_offset,
array& out) {
// Compute row/col contiguity
auto [data_size, is_row_contiguous, is_col_contiguous] =
check_contiguity(out.shape(), out_strides);
auto flags = in.flags();
flags.row_contiguous = is_row_contiguous;
flags.col_contiguous = is_col_contiguous;
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in.data_size()) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
out.copy_shared_buffer(in, out_strides, flags, data_size, data_offset);
}
std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
const array& in) {
int64_t data_offset = 0;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices_[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides_[i];
}
return std::make_tuple(data_offset, inp_strides);
}
void StopGradient::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.copy_shared_buffer(inputs[0]);
}
void Transpose::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
std::vector<size_t> out_strides(out.ndim());
auto& in = inputs[0];
for (int ax = 0; ax < axes_.size(); ++ax) {
out_strides[ax] = in.strides()[axes_[ax]];
}
// Conditions for {row/col}_contiguous
// - array must be contiguous (no gaps)
// - underlying buffer size should have the same size as the array
// - cumulative product of shapes is equal to the strides (we can ignore axes
// with size == 1)
// - in the forward direction (column contiguous)
// - in the reverse direction (row contiguous)
// - vectors are both row and col contiguous (hence if both row/col are
// true, they stay true)
auto flags = in.flags();
if (flags.contiguous && in.data_size() == in.size()) {
size_t f_stride = 1;
size_t b_stride = 1;
flags.col_contiguous = true;
flags.row_contiguous = true;
for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
flags.col_contiguous &= (out_strides[i] == f_stride || out.shape(i) == 1);
f_stride *= out.shape(i);
flags.row_contiguous &=
(out_strides[ri] == b_stride || out.shape(ri) == 1);
b_stride *= out.shape(ri);
}
}
out.copy_shared_buffer(in, out_strides, flags, in.data_size());
}
} // namespace mlx::core

View File

@@ -111,13 +111,17 @@ void slow_conv_2D(
const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
const int iH = 1 + in_dilation[0] * (in.shape(1) - 1); // Input spatial dim
const int iW = 1 + in_dilation[1] * (in.shape(2) - 1); // Input spatial dim
const int C = in.shape(3); // In channels
const int oH = out.shape(1); // Output spatial dim
const int oW = out.shape(2); // Output spatial dim
const int O = wt.shape(0); // Out channels
const int C = wt.shape(3); // In channels
const int wH = wt.shape(1); // Weight spatial dim
const int wW = wt.shape(2); // Weight spatial dim
const int groups = C / wt.shape(3);
const int C_per_group = wt.shape(3);
const int O_per_group = O / groups;
const size_t in_stride_N = in.strides()[0];
const size_t in_stride_H = in.strides()[1];
const size_t in_stride_W = in.strides()[2];
@@ -141,33 +145,35 @@ void slow_conv_2D(
int ih_base = oh * wt_strides[0] - padding[0];
int iw_base = ow * wt_strides[1] - padding[1];
for (int o = 0; o < O; ++o) {
float r = 0.;
for (int g = 0; g < groups; ++g) {
for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
float r = 0.;
for (int wh = 0; wh < wH; ++wh) {
for (int ww = 0; ww < wW; ++ww) {
int wh_flip = flip ? wH - wh - 1 : wh;
int ww_flip = flip ? wW - ww - 1 : ww;
int ih = ih_base + wh_flip * wt_dilation[0];
int iw = iw_base + ww_flip * wt_dilation[1];
for (int wh = 0; wh < wH; ++wh) {
for (int ww = 0; ww < wW; ++ww) {
int wh_flip = flip ? wH - wh - 1 : wh;
int ww_flip = flip ? wW - ww - 1 : ww;
int ih = ih_base + wh_flip * wt_dilation[0];
int iw = iw_base + ww_flip * wt_dilation[1];
const T* wt_ptr_pt = wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
const T* in_ptr_pt = in_ptr + ih * in_stride_H + iw * in_stride_W;
const T* wt_ptr_pt =
wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
const T* in_ptr_pt =
in_ptr + ih * in_stride_H + iw * in_stride_W;
for (int c = 0; c < C; ++c) {
r += static_cast<float>(in_ptr_pt[0]) *
static_cast<float>(wt_ptr_pt[0]);
in_ptr_pt += in_stride_C;
wt_ptr_pt += wt_stride_C;
} // c
for (int c = g * C_per_group; c < (g + 1) * C_per_group; ++c) {
r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
static_cast<float>(
wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
} // c
} // ww
} // wh
} // ww
} // wh
out_ptr[0] = static_cast<T>(r);
out_ptr += out_stride_O;
wt_ptr += wt_stride_O;
} // o
out_ptr[0] = static_cast<T>(r);
out_ptr += out_stride_O;
wt_ptr += wt_stride_O;
} // o
} // g
};
int jump_h = flip ? -wt_dilation[0] : wt_dilation[0];
@@ -219,41 +225,43 @@ void slow_conv_2D(
int wh_base = base_h[oh % f_out_jump_h];
int ww_base = base_w[ow % f_out_jump_w];
for (int o = 0; o < O; ++o) {
float r = 0.;
for (int g = 0; g < groups; ++g) {
for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
float r = 0.;
for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
int wh_flip = flip ? wH - wh - 1 : wh;
int ww_flip = flip ? wW - ww - 1 : ww;
int ih = ih_base + wh_flip * wt_dilation[0];
int iw = iw_base + ww_flip * wt_dilation[1];
for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
int wh_flip = flip ? wH - wh - 1 : wh;
int ww_flip = flip ? wW - ww - 1 : ww;
int ih = ih_base + wh_flip * wt_dilation[0];
int iw = iw_base + ww_flip * wt_dilation[1];
if (ih >= 0 && ih < iH && iw >= 0 && iw < iW) {
const T* wt_ptr_pt =
wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
if (ih >= 0 && ih < iH && iw >= 0 && iw < iW) {
const T* wt_ptr_pt =
wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
int ih_dil = !is_idil_one ? (ih / in_dilation[0]) : ih;
int iw_dil = !is_idil_one ? (iw / in_dilation[1]) : iw;
int ih_dil = !is_idil_one ? (ih / in_dilation[0]) : ih;
int iw_dil = !is_idil_one ? (iw / in_dilation[1]) : iw;
const T* in_ptr_pt =
in_ptr + ih_dil * in_stride_H + iw_dil * in_stride_W;
const T* in_ptr_pt =
in_ptr + ih_dil * in_stride_H + iw_dil * in_stride_W;
for (int c = 0; c < C; ++c) {
r += static_cast<float>(in_ptr_pt[0]) *
static_cast<float>(wt_ptr_pt[0]);
in_ptr_pt += in_stride_C;
wt_ptr_pt += wt_stride_C;
} // c
for (int c = g * C_per_group; c < (g + 1) * C_per_group;
++c) {
r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
static_cast<float>(
wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
} // c
} // ih, iw check
} // ww
} // wh
} // ih, iw check
} // ww
} // wh
out_ptr[0] = static_cast<T>(r);
out_ptr += out_stride_O;
wt_ptr += wt_stride_O;
} // o
out_ptr[0] = static_cast<T>(r);
out_ptr += out_stride_O;
wt_ptr += wt_stride_O;
} // o
} // g
};
int oH_border_0 = 0;
@@ -310,6 +318,296 @@ void slow_conv_2D(
} // n
}
template <typename T>
void slow_conv_3D(
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation,
const std::vector<int>& in_dilation,
bool flip) {
const T* st_wt_ptr = wt.data<T>();
const T* st_in_ptr = in.data<T>();
T* st_out_ptr = out.data<T>();
const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
const int iD = 1 + in_dilation[0] * (in.shape(1) - 1); // Input spatial dim
const int iH = 1 + in_dilation[1] * (in.shape(2) - 1); // Input spatial dim
const int iW = 1 + in_dilation[2] * (in.shape(3) - 1); // Input spatial dim
const int oD = out.shape(1); // Output spatial dim
const int oH = out.shape(2); // Output spatial dim
const int oW = out.shape(3); // Output spatial dim
const int O = wt.shape(0); // Out channels
const int C = wt.shape(4); // In channels
const int wD = wt.shape(1); // Weight spatial dim
const int wH = wt.shape(2); // Weight spatial dim
const int wW = wt.shape(3); // Weight spatial dim
const size_t in_stride_N = in.strides()[0];
const size_t in_stride_D = in.strides()[1];
const size_t in_stride_H = in.strides()[2];
const size_t in_stride_W = in.strides()[3];
const size_t in_stride_C = in.strides()[4];
const size_t wt_stride_O = wt.strides()[0];
const size_t wt_stride_D = wt.strides()[1];
const size_t wt_stride_H = wt.strides()[2];
const size_t wt_stride_W = wt.strides()[3];
const size_t wt_stride_C = wt.strides()[4];
const size_t out_stride_N = out.strides()[0];
const size_t out_stride_D = out.strides()[1];
const size_t out_stride_H = out.strides()[2];
const size_t out_stride_W = out.strides()[3];
const size_t out_stride_O = out.strides()[4];
bool is_idil_one =
in_dilation[0] == 1 && in_dilation[1] == 1 && in_dilation[2] == 1;
auto pt_conv_no_checks = [&](const T* in_ptr,
const T* wt_ptr,
T* out_ptr,
int od,
int oh,
int ow) {
out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;
int id_base = od * wt_strides[0] - padding[0];
int ih_base = oh * wt_strides[1] - padding[1];
int iw_base = ow * wt_strides[2] - padding[2];
for (int o = 0; o < O; ++o) {
float r = 0.;
for (int wd = 0; wd < wD; ++wd) {
for (int wh = 0; wh < wH; ++wh) {
for (int ww = 0; ww < wW; ++ww) {
int wd_flip = flip ? wD - wd - 1 : wd;
int wh_flip = flip ? wH - wh - 1 : wh;
int ww_flip = flip ? wW - ww - 1 : ww;
int id = id_base + wd_flip * wt_dilation[0];
int ih = ih_base + wh_flip * wt_dilation[1];
int iw = iw_base + ww_flip * wt_dilation[2];
const T* wt_ptr_pt =
wt_ptr + wd * wt_stride_D + wh * wt_stride_H + ww * wt_stride_W;
const T* in_ptr_pt =
in_ptr + id * in_stride_D + ih * in_stride_H + iw * in_stride_W;
for (int c = 0; c < C; ++c) {
r += static_cast<float>(in_ptr_pt[0]) *
static_cast<float>(wt_ptr_pt[0]);
in_ptr_pt += in_stride_C;
wt_ptr_pt += wt_stride_C;
} // c
} // ww
} // wh
} // wd
out_ptr[0] = static_cast<T>(r);
out_ptr += out_stride_O;
wt_ptr += wt_stride_O;
} // o
};
int jump_d = flip ? -wt_dilation[0] : wt_dilation[0];
int jump_h = flip ? -wt_dilation[1] : wt_dilation[1];
int jump_w = flip ? -wt_dilation[2] : wt_dilation[2];
int init_d = (flip ? (wD - 1) * wt_dilation[0] : 0);
int init_h = (flip ? (wH - 1) * wt_dilation[1] : 0);
int init_w = (flip ? (wW - 1) * wt_dilation[2] : 0);
int f_wgt_jump_d = std::lcm(in_dilation[0], wt_dilation[0]) / wt_dilation[0];
int f_wgt_jump_h = std::lcm(in_dilation[1], wt_dilation[1]) / wt_dilation[1];
int f_wgt_jump_w = std::lcm(in_dilation[2], wt_dilation[2]) / wt_dilation[2];
int f_out_jump_d = std::lcm(in_dilation[0], wt_strides[0]) / wt_strides[0];
int f_out_jump_h = std::lcm(in_dilation[1], wt_strides[1]) / wt_strides[1];
int f_out_jump_w = std::lcm(in_dilation[2], wt_strides[2]) / wt_strides[2];
std::vector<int> base_d(f_out_jump_d);
std::vector<int> base_h(f_out_jump_h);
std::vector<int> base_w(f_out_jump_w);
for (int i = 0; i < f_out_jump_d; ++i) {
int id_loop = i * wt_strides[0] - padding[0] + init_d;
int wd_base = 0;
while (wd_base < wD && id_loop % in_dilation[0] != 0) {
wd_base++;
id_loop += jump_d;
}
base_d[i] = wd_base;
}
for (int i = 0; i < f_out_jump_h; ++i) {
int ih_loop = i * wt_strides[1] - padding[1] + init_h;
int wh_base = 0;
while (wh_base < wH && ih_loop % in_dilation[1] != 0) {
wh_base++;
ih_loop += jump_h;
}
base_h[i] = wh_base;
}
for (int j = 0; j < f_out_jump_w; ++j) {
int iw_loop = j * wt_strides[2] - padding[2] + init_w;
int ww_base = 0;
while (ww_base < wW && iw_loop % in_dilation[2] != 0) {
ww_base++;
iw_loop += jump_w;
}
base_w[j] = ww_base;
}
auto pt_conv_all_checks = [&](const T* in_ptr,
const T* wt_ptr,
T* out_ptr,
int od,
int oh,
int ow) {
out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;
int id_base = od * wt_strides[0] - padding[0];
int ih_base = oh * wt_strides[1] - padding[1];
int iw_base = ow * wt_strides[2] - padding[2];
int wd_base = base_d[od % f_out_jump_d];
int wh_base = base_h[oh % f_out_jump_h];
int ww_base = base_w[ow % f_out_jump_w];
for (int o = 0; o < O; ++o) {
float r = 0.;
for (int wd = wd_base; wd < wD; wd += f_wgt_jump_d) {
for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
int wd_flip = flip ? wD - wd - 1 : wd;
int wh_flip = flip ? wH - wh - 1 : wh;
int ww_flip = flip ? wW - ww - 1 : ww;
int id = id_base + wd_flip * wt_dilation[0];
int ih = ih_base + wh_flip * wt_dilation[1];
int iw = iw_base + ww_flip * wt_dilation[2];
if (id >= 0 && id < iD && ih >= 0 && ih < iH && iw >= 0 &&
iw < iW) {
const T* wt_ptr_pt = wt_ptr + wd * wt_stride_D +
wh * wt_stride_H + ww * wt_stride_W;
int id_dil = !is_idil_one ? (id / in_dilation[0]) : id;
int ih_dil = !is_idil_one ? (ih / in_dilation[1]) : ih;
int iw_dil = !is_idil_one ? (iw / in_dilation[2]) : iw;
const T* in_ptr_pt = in_ptr + id_dil * in_stride_D +
ih_dil * in_stride_H + iw_dil * in_stride_W;
for (int c = 0; c < C; ++c) {
r += static_cast<float>(in_ptr_pt[0]) *
static_cast<float>(wt_ptr_pt[0]);
in_ptr_pt += in_stride_C;
wt_ptr_pt += wt_stride_C;
} // c
} // iD, ih, iw check
} // ww
} // wh
} // wd
out_ptr[0] = static_cast<T>(r);
out_ptr += out_stride_O;
wt_ptr += wt_stride_O;
} // o
};
int oD_border_0 = 0;
int oD_border_1 =
is_idil_one ? ((padding[0] + wt_strides[0] - 1) / wt_strides[0]) : oD;
int oD_border_2 = std::max(
oD_border_1, (iD + padding[0] - wD * wt_dilation[0]) / wt_strides[0]);
int oD_border_3 = oD;
int oH_border_0 = 0;
int oH_border_1 =
is_idil_one ? ((padding[1] + wt_strides[1] - 1) / wt_strides[1]) : oH;
int oH_border_2 = std::max(
oH_border_1, (iH + padding[1] - wH * wt_dilation[1]) / wt_strides[1]);
int oH_border_3 = oH;
int oW_border_0 = 0;
int oW_border_1 =
is_idil_one ? ((padding[2] + wt_strides[2] - 1) / wt_strides[2]) : oW;
int oW_border_2 = std::max(
oW_border_1, (iW + padding[2] - wW * wt_dilation[2]) / wt_strides[2]);
int oW_border_3 = oW;
for (int n = 0; n < N; ++n) {
// Case 1: od might put us out of bounds
for (int od = oD_border_0; od < oD_border_1; ++od) {
for (int oh = 0; oh < oH; ++oh) {
for (int ow = 0; ow < oW; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
} // od
// Case 2: od in bounds
for (int od = oD_border_1; od < oD_border_2; ++od) {
// Case 2.1: oh might put us out of bounds
for (int oh = oH_border_0; oh < oH_border_1; ++oh) {
for (int ow = 0; ow < oW; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
// Case 2.2: oh in bounds
for (int oh = oH_border_1; oh < oH_border_2; ++oh) {
// Case 2.2.1: ow might put us out of bounds
for (int ow = oW_border_0; ow < oW_border_1; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
// Case 2.2.2: ow in bounds
for (int ow = oW_border_1; ow < oW_border_2; ++ow) {
pt_conv_no_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
// Case 2.2.3: ow might put us out of bounds
for (int ow = oW_border_2; ow < oW_border_3; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
// Case 2.3: oh might put us out of bounds
for (int oh = oH_border_2; oh < oH_border_3; ++oh) {
for (int ow = 0; ow < oW; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
} // od
// Case 3: od might put us out of bounds
for (int od = oD_border_2; od < oD_border_3; ++od) {
for (int oh = 0; oh < oH; ++oh) {
for (int ow = 0; ow < oW; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
} // od
st_in_ptr += in_stride_N;
st_out_ptr += out_stride_N;
} // n
}
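The loop nest above splits each output axis into up to three slabs so that only border outputs pay for bounds checks: a leading slab that may read before the padded start, an interior slab that is guaranteed in-bounds and can take the no-checks path, and a trailing slab that may read past the end. A minimal 1-D sketch of that split, mirroring the oD_border_* formulas above (the names and driver values are illustrative, not part of the diff):

#include <algorithm>
#include <cstdio>
#include <utility>

// Hypothetical 1-D analogue of the oD_border_* computation above.
// Returns {lo, hi}: outputs in [lo, hi) never index outside the input.
std::pair<int, int> interior_range(
    int in_size, int out_size, int kernel, int stride, int pad, int dilation) {
  // First output whose leftmost tap (o * stride - pad) is non-negative.
  int lo = (pad + stride - 1) / stride;
  // Conservative bound on outputs whose rightmost tap stays in range.
  int hi = std::max(lo, (in_size + pad - kernel * dilation) / stride);
  return {std::min(lo, out_size), std::min(hi, out_size)};
}

int main() {
  // e.g. iD = 8, oD = 8, wD = 3, stride 1, pad 1, dilation 1
  auto [lo, hi] = interior_range(8, 8, 3, 1, 1, 1);
  std::printf("checked: [0,%d) and [%d,8); unchecked: [%d,%d)\n", lo, hi, lo, hi);
}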
void dispatch_slow_conv_1D(
const array& in,
const array& wt,
@@ -358,6 +656,30 @@ void dispatch_slow_conv_2D(
}
}
void dispatch_slow_conv_3D(
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation,
const std::vector<int>& in_dilation,
bool flip) {
if (in.dtype() == float32) {
return slow_conv_3D<float>(
in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
} else if (in.dtype() == float16) {
return slow_conv_3D<float16_t>(
in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
} else if (in.dtype() == bfloat16) {
return slow_conv_3D<bfloat16_t>(
in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
} else {
throw std::invalid_argument(
"[Convolution::eval] got unsupported data type.");
}
}
///////////////////////////////////////////////////////////////////////////////
// Explicit gemm conv
///////////////////////////////////////////////////////////////////////////////
@@ -582,6 +904,131 @@ void explicit_gemm_conv_2D_cpu(
}
}
void explicit_gemm_conv_ND_cpu(
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation) {
const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
const auto iDim = std::vector<int>(
in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
const auto oDim = std::vector<int>(
out.shape().begin() + 1, out.shape().end() - 1); // Output spatial dim
const int O = wt.shape(0); // Out channels
const int C = wt.shape(-1); // In channels
const auto wDim = std::vector<int>(
wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim
auto conv_dtype = float32;
// Pad input
std::vector<int> padded_shape(in.shape().size());
padded_shape.front() = N;
for (size_t i = 0; i < iDim.size(); i++) {
padded_shape[i + 1] = iDim[i] + 2 * padding[i];
}
padded_shape.back() = C;
array in_padded(padded_shape, conv_dtype, nullptr, {});
// Fill with zeros
copy(array(0, conv_dtype), in_padded, CopyType::Scalar);
// Pick input slice from padded
size_t data_offset = 0;
for (size_t i = 0; i < padding.size(); i++) {
data_offset += padding[i] * in_padded.strides()[i + 1];
}
array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
in_padded_slice.copy_shared_buffer(
in_padded,
in_padded.strides(),
in_padded.flags(),
in_padded_slice.size(),
data_offset);
// Copy input values into the slice
copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);
// Make strided view
std::vector<int> strided_shape(oDim.size() + wDim.size() + 2);
strided_shape.front() = N;
for (size_t i = 0; i < oDim.size(); i++) {
strided_shape[i + 1] = oDim[i];
}
for (size_t i = 0; i < wDim.size(); i++) {
strided_shape[i + 1 + oDim.size()] = wDim[i];
}
strided_shape.back() = C;
std::vector<size_t> strided_strides(in.shape().size() * 2 - 2);
strided_strides[0] = in_padded.strides()[0];
for (size_t i = 0; i < wt_strides.size(); i++) {
strided_strides[i + 1] = in_padded.strides()[i + 1] * wt_strides[i];
}
for (size_t i = 1; i < in_padded.strides().size(); i++) {
strided_strides[i + wt_strides.size()] = in_padded.strides()[i];
}
auto flags = in_padded.flags();
array in_strided_view(strided_shape, in_padded.dtype(), nullptr, {});
in_strided_view.copy_shared_buffer(
in_padded, strided_strides, flags, in_strided_view.size(), 0);
// Materialize strided view
std::vector<int> strided_reshape = {N, C};
for (const auto& o : oDim) {
strided_reshape[0] *= o;
}
for (const auto& w : wDim) {
strided_reshape[1] *= w;
}
array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
copy(in_strided_view, in_strided, CopyType::General);
// Check wt dtype and prepare
auto gemm_wt = wt;
auto gemm_out = out;
if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
auto ctype =
wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
gemm_wt = array(wt.shape(), float32, nullptr, {});
copy(wt, gemm_wt, ctype);
}
if (out.dtype() != float32) {
gemm_out = array(out.shape(), float32, nullptr, {});
gemm_out.set_data(allocator::malloc_or_wait(gemm_out.nbytes()));
}
// Perform gemm
cblas_sgemm(
CblasRowMajor,
CblasNoTrans, // no trans A
CblasTrans, // transB
strided_reshape[0], // M
O, // N
strided_reshape[1], // K
1.0f, // alpha
in_strided.data<float>(),
strided_reshape[1], // lda
gemm_wt.data<float>(),
strided_reshape[1], // ldb
0.0f, // beta
gemm_out.data<float>(),
O // ldc
);
// Copy results if needed
if (out.dtype() != float32) {
copy(gemm_out, out, CopyType::Vector);
}
}
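For reference, the strided view above reduces the N-D convolution to a single GEMM: every output location becomes a row of an (N·∏oDim) × (C·∏wDim) patch matrix, multiplied against the O × (C·∏wDim) weight matrix (transposed via CblasTrans). A small sketch of the shape bookkeeping under the same channels-last layout (illustrative names only):

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// Illustrative GEMM dimensions for the strided-view convolution above.
struct GemmDims { int M, N, K; };

GemmDims conv_gemm_dims(
    int batch, int channels, int out_channels,
    const std::vector<int>& o_dim, const std::vector<int>& w_dim) {
  int spatial_out = std::accumulate(
      o_dim.begin(), o_dim.end(), 1, std::multiplies<int>());
  int wt_taps = std::accumulate(
      w_dim.begin(), w_dim.end(), 1, std::multiplies<int>());
  return {batch * spatial_out,  // M: one row per output location
          out_channels,         // N: one column per output channel
          channels * wt_taps};  // K: one reduction element per weight tap
}

int main() {
  auto d = conv_gemm_dims(2, 16, 32, {8, 8, 8}, {3, 3, 3});
  std::printf("M=%d N=%d K=%d\n", d.M, d.N, d.K);  // M=1024 N=32 K=432
}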
///////////////////////////////////////////////////////////////////////////////
// Conv routing
///////////////////////////////////////////////////////////////////////////////
@@ -617,6 +1064,19 @@ void conv_2D_cpu(
in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
}
void conv_3D_cpu(
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation,
const std::vector<int>& in_dilation,
bool flip) {
return dispatch_slow_conv_3D(
in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
}
} // namespace
void Convolution::eval(const std::vector<array>& inputs, array& out) {
@@ -625,8 +1085,20 @@ void Convolution::eval(const std::vector<array>& inputs, array& out) {
auto& in = inputs[0];
auto& wt = inputs[1];
// 3D convolution
if (in.ndim() == (3 + 2)) {
return conv_3D_cpu(
in,
wt,
out,
padding_,
kernel_strides_,
kernel_dilation_,
input_dilation_,
flip_);
}
// 2D convolution
if (in.ndim() == (2 + 2)) {
else if (in.ndim() == (2 + 2)) {
return conv_2D_cpu(
in,
wt,

View File

@@ -256,7 +256,7 @@ void copy_general_general(
}
int size = std::accumulate(
data_shape.begin() - 5, data_shape.end(), 1, std::multiplies<int>());
data_shape.end() - 5, data_shape.end(), 1, std::multiplies<int>());
for (int i = 0; i < src.size(); i += size) {
stride_t src_offset = i_offset + elem_to_loc(i, data_shape, i_strides);
stride_t dst_offset = o_offset + elem_to_loc(i, dst.shape(), o_strides);
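The one-line fix above replaces data_shape.begin() - 5 (an iterator before the start of the vector, so the product ran over garbage) with data_shape.end() - 5, making size the element count of the innermost five dimensions that each inner copy call covers. A tiny illustration of the corrected accumulate, with made-up shapes:

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  // e.g. a 7-D copy; each inner call covers the last five dimensions.
  std::vector<int> data_shape = {2, 3, 4, 4, 2, 8, 8};
  int size = std::accumulate(
      data_shape.end() - 5, data_shape.end(), 1, std::multiplies<int>());
  std::printf("%d\n", size);  // 4 * 4 * 2 * 8 * 8 = 2048
}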

View File

@@ -34,6 +34,7 @@ DEFAULT(ArcCosh)
DEFAULT(ArcSin)
DEFAULT(ArcSinh)
DEFAULT(ArcTan)
DEFAULT(ArcTan2)
DEFAULT(ArcTanh)
DEFAULT(ArgPartition)
DEFAULT(ArgReduce)
@@ -42,10 +43,12 @@ DEFAULT(AsType)
DEFAULT(AsStrided)
DEFAULT(Broadcast)
DEFAULT(BlockMaskedMM)
DEFAULT(BlockSparseMM)
DEFAULT(GatherMM)
DEFAULT(GatherQMM)
DEFAULT_MULTI(DivMod)
DEFAULT(Ceil)
DEFAULT(Concatenate)
DEFAULT(Conjugate)
DEFAULT(Convolution)
DEFAULT(Copy)
DEFAULT(Cos)
@@ -110,6 +113,7 @@ DEFAULT(Tan)
DEFAULT(Tanh)
DEFAULT(Transpose)
DEFAULT(Inverse)
DEFAULT(Cholesky)
namespace {

View File

@@ -2,7 +2,6 @@
#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h"
#ifdef ACCELERATE_NEW_LAPACK
@@ -93,12 +92,4 @@ void Inverse::eval(const std::vector<array>& inputs, array& output) {
inverse_impl(inputs[0], output);
}
std::pair<std::vector<array>, std::vector<int>> Inverse::vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) {
auto ax = axes[0] >= 0 ? 0 : -1;
auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
return {{linalg::inv(a, stream())}, {ax}};
}
} // namespace mlx::core

View File

@@ -17,24 +17,25 @@ namespace mlx::core {
namespace {
template <typename T>
template <typename T, typename mask_t>
inline void mask_matrix(
T* data,
const bool* mask,
const mask_t* mask,
int block_size,
const int X,
const int Y,
const size_t X_data_str,
const size_t Y_data_str,
const size_t X_mask_str,
const size_t Y_mask_str) {
const size_t Y_mask_str,
const size_t mask_offset) {
int tX = (X + block_size - 1) / block_size;
int tY = (Y + block_size - 1) / block_size;
for (int i = 0; i < tX; i++) {
for (int j = 0; j < tY; j++) {
bool do_mask = mask[i * X_mask_str + j * Y_mask_str];
if (!do_mask) {
mask_t do_mask = mask[mask_offset + i * X_mask_str + j * Y_mask_str];
if (do_mask != 1) {
int loc_x = i * block_size;
int loc_y = j * block_size;
T* data_block = data + loc_x * X_data_str + loc_y * Y_data_str;
@@ -43,7 +44,11 @@ inline void mask_matrix(
int size_y = std::min(block_size, Y - loc_y);
for (int ii = 0; ii < size_x; ii++) {
for (int jj = 0; jj < size_y; jj++) {
data_block[ii * X_data_str + jj * Y_data_str] = T(0.);
if constexpr (std::is_same_v<mask_t, bool>) {
data_block[ii * X_data_str + jj * Y_data_str] = T(0.);
} else {
data_block[ii * X_data_str + jj * Y_data_str] *= do_mask;
}
}
}
}
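With mask_t templated, a bool mask keeps the old behavior (zero out blocks whose mask entry is false), while a float mask scales each block by its entry and skips blocks whose value is exactly 1, per the do_mask != 1 test above. A standalone sketch of the float path on a tiny row-major matrix, assuming unit strides and no offset (illustrative only):

#include <algorithm>
#include <cstdio>

// Scale each block_size x block_size tile of a row-major X x Y matrix by the
// matching float mask entry; tiles with mask == 1 are left untouched.
void mask_matrix_f(float* data, const float* mask, int block_size, int X, int Y) {
  int tX = (X + block_size - 1) / block_size;
  int tY = (Y + block_size - 1) / block_size;
  for (int i = 0; i < tX; i++) {
    for (int j = 0; j < tY; j++) {
      float m = mask[i * tY + j];
      if (m == 1.f) {
        continue;
      }
      for (int ii = i * block_size; ii < std::min((i + 1) * block_size, X); ii++) {
        for (int jj = j * block_size; jj < std::min((j + 1) * block_size, Y); jj++) {
          data[ii * Y + jj] *= m;  // a bool mask would write 0 here instead
        }
      }
    }
  }
}

int main() {
  float a[16];
  std::fill(a, a + 16, 1.f);
  const float mask[4] = {0.f, 1.f, 0.5f, 1.f};  // 2x2 grid of 2x2 tiles
  mask_matrix_f(a, mask, 2, 4, 4);
  std::printf("%g %g\n", a[0], a[8]);  // 0 (zeroed tile), 0.5 (scaled tile)
}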
@@ -62,36 +67,39 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
auto& a_pre = inputs[0];
auto& b_pre = inputs[1];
auto& out_mask = inputs[2];
auto check_transpose = [](const array& arr, bool do_copy) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (stx == arr.shape(-1) && sty == 1) {
if (do_copy) {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::Vector);
return std::make_tuple(false, stx, arr_copy);
}
return std::make_tuple(false, stx, arr);
} else if (stx == 1 && sty == arr.shape(-2)) {
if (do_copy) {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::Vector);
return std::make_tuple(true, sty, arr_copy);
}
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::General);
size_t stx = arr.shape(-1);
return std::make_tuple(false, stx, arr_copy);
}
};
auto check_transpose =
[](const array& arr, bool do_copy, bool expand_all = false) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (!expand_all && stx == arr.shape(-1) && sty == 1) {
if (do_copy) {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::Vector);
return std::make_tuple(false, stx, arr_copy);
}
return std::make_tuple(false, stx, arr);
} else if (!expand_all && stx == 1 && sty == arr.shape(-2)) {
if (do_copy) {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::Vector);
return std::make_tuple(true, sty, arr_copy);
}
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::General);
size_t stx = arr.shape(-1);
return std::make_tuple(false, stx, arr_copy);
}
};
bool has_op_mask = inputs.size() > 3;
auto [a_transposed, lda, a] = check_transpose(a_pre, has_op_mask);
auto [b_transposed, ldb, b] = check_transpose(b_pre, has_op_mask);
bool has_out_mask = inputs.size() == 3 || inputs.size() == 5;
auto [a_transposed, lda, a] =
check_transpose(a_pre, has_op_mask, inputs.back().dtype() != bool_);
auto [b_transposed, ldb, b] =
check_transpose(b_pre, has_op_mask, inputs.back().dtype() != bool_);
size_t M = a.shape(-2);
size_t N = b.shape(-1);
@@ -114,27 +122,42 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
int Y,
size_t X_data_str,
size_t Y_data_str) {
const bool* mask_ptr = mask.data<bool>() +
elem_to_loc(mask.shape(-1) * mask.shape(-2) * batch_idx,
mask.shape(),
mask.strides());
size_t mask_offset = elem_to_loc(
mask.shape(-1) * mask.shape(-2) * batch_idx,
mask.shape(),
mask.strides());
size_t X_mask_str = mask.strides()[mask.ndim() - 2];
size_t Y_mask_str = mask.strides()[mask.ndim() - 1];
return mask_matrix(
data,
mask_ptr,
block_size,
X,
Y,
X_data_str,
Y_data_str,
X_mask_str,
Y_mask_str);
if (mask.dtype() == bool_) {
return mask_matrix(
data,
mask.data<bool>(),
block_size,
X,
Y,
X_data_str,
Y_data_str,
X_mask_str,
Y_mask_str,
mask_offset);
} else {
return mask_matrix(
data,
mask.data<float>(),
block_size,
X,
Y,
X_data_str,
Y_data_str,
X_mask_str,
Y_mask_str,
mask_offset);
}
};
for (int i = 0; i < (a.size() / (M * K)); ++i) {
for (int i = 0; i < (out.size() / (M * size_t(N))); ++i) {
// Adjust pointer
float* ai =
a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides());
@@ -144,7 +167,7 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
// Zero out blocks in a and b if needed
if (has_op_mask) {
auto& a_mask = inputs[3];
auto& a_mask = inputs[inputs.size() - 2];
mask_array(
a_mask,
ai,
@@ -155,7 +178,7 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
a_transposed ? 1 : lda,
a_transposed ? lda : 1);
auto& b_mask = inputs[4];
auto& b_mask = inputs[inputs.size() - 1];
mask_array(
b_mask,
bi,
@@ -186,14 +209,16 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
);
// Zero out blocks in out
mask_array(out_mask, ci, block_size_, i, M, N, N, 1);
if (has_out_mask) {
mask_array(inputs[2], ci, block_size_, i, M, N, N, 1);
}
}
}
void BlockSparseMM::eval(const std::vector<array>& inputs, array& out) {
void GatherMM::eval(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[BlockSparseMM::eval] Currently only supports float32.");
"[GatherMM::eval] Currently only supports float32.");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
@@ -277,4 +302,4 @@ void BlockSparseMM::eval(const std::vector<array>& inputs, array& out) {
}
}
} // namespace mlx::core
} // namespace mlx::core

View File

@@ -161,6 +161,13 @@ struct ArcTan {
};
};
struct ArcTan2 {
template <typename T>
T operator()(T y, T x) {
return std::atan2(y, x);
};
};
struct ArcTanh {
template <typename T>
T operator()(T x) {
@@ -202,6 +209,12 @@ struct Ceil {
};
};
struct Conjugate {
complex64_t operator()(complex64_t x) {
return std::conj(x);
}
};
struct Cos {
template <typename T>
T operator()(T x) {

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cassert>
@@ -113,61 +113,6 @@ void AsType::eval(const std::vector<array>& inputs, array& out) {
copy(in, out, ctype);
}
void AsStrided::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (!in.flags().row_contiguous) {
// Just ensuring that inputs[0] came from the ops which would ensure the
// input is row contiguous.
throw std::runtime_error(
"AsStrided must be used with row contiguous arrays only.");
}
// Compute the flags given the shape and strides
bool row_contiguous = true, col_contiguous = true;
size_t r = 1, c = 1;
for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
r *= shape_[i];
c *= shape_[j];
}
auto flags = in.flags();
// TODO: Compute the contiguous flag in a better way cause now we are
// unnecessarily strict.
flags.contiguous = row_contiguous || col_contiguous;
flags.row_contiguous = row_contiguous;
flags.col_contiguous = col_contiguous;
// There is no easy way to compute the actual data size so we use out.size().
// The contiguous flag will almost certainly not be set so no code should
// rely on data_size anyway.
size_t data_size = out.size();
return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
}
void Broadcast::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.size() == 0) {
out.set_data(nullptr);
return;
}
std::vector<size_t> strides(out.ndim(), 0);
int diff = out.ndim() - in.ndim();
for (int i = in.ndim() - 1; i >= 0; --i) {
strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
}
auto flags = in.flags();
if (out.size() > in.size()) {
flags.row_contiguous = flags.col_contiguous = false;
}
out.copy_shared_buffer(in, strides, flags, in.data_size());
}
void Ceil::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
@@ -203,9 +148,15 @@ void Concatenate::eval(const std::vector<array>& inputs, array& out) {
}
}
void Copy::eval(const std::vector<array>& inputs, array& out) {
void Conjugate::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.copy_shared_buffer(inputs[0]);
const auto& in = inputs[0];
if (out.dtype() == complex64) {
unary_fp(in, out, detail::Conjugate());
} else {
throw std::invalid_argument(
"[conjugate] conjugate must be called on complex input.");
}
}
void Cos::eval(const std::vector<array>& inputs, array& out) {
@@ -232,81 +183,6 @@ void Cosh::eval(const std::vector<array>& inputs, array& out) {
}
}
void CustomVJP::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
i++, j++) {
outputs[i].copy_shared_buffer(inputs[j]);
}
}
void Depends::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0; i < outputs.size(); i++) {
outputs[i].copy_shared_buffer(inputs[i]);
}
}
void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
double numel = 1;
for (auto ax : axes_) {
numel *= inputs[0].shape(ax);
}
if (inverted_) {
numel = 1.0 / numel;
}
switch (out.dtype()) {
case bool_:
*out.data<bool>() = static_cast<bool>(numel);
break;
case uint8:
*out.data<uint8_t>() = static_cast<uint8_t>(numel);
break;
case uint16:
*out.data<uint16_t>() = static_cast<uint16_t>(numel);
break;
case uint32:
*out.data<uint32_t>() = static_cast<uint32_t>(numel);
break;
case uint64:
*out.data<uint64_t>() = static_cast<uint64_t>(numel);
break;
case int8:
*out.data<int8_t>() = static_cast<int8_t>(numel);
break;
case int16:
*out.data<int16_t>() = static_cast<int16_t>(numel);
break;
case int32:
*out.data<int32_t>() = static_cast<int32_t>(numel);
break;
case int64:
*out.data<int64_t>() = static_cast<int64_t>(numel);
break;
case float16:
*out.data<float16_t>() = static_cast<float16_t>(numel);
break;
case float32:
*out.data<float>() = static_cast<float>(numel);
break;
case bfloat16:
*out.data<bfloat16_t>() = static_cast<bfloat16_t>(numel);
break;
case complex64:
*out.data<complex64_t>() = static_cast<complex64_t>(numel);
break;
}
}
void Erf::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
@@ -536,63 +412,6 @@ void RandomBits::eval(const std::vector<array>& inputs, array& out) {
}
}
std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
const array& in,
const array& out) {
// Special case for empty arrays or row contiguous arrays
if (in.size() == 0 || in.flags().row_contiguous) {
return {false, out.strides()};
}
// Special case for scalars
if (in.ndim() == 0) {
std::vector<size_t> out_strides(out.ndim(), 0);
return {false, out_strides};
}
// Firstly let's collapse all the contiguous dimensions of the input
auto [shape, _strides] = collapse_contiguous_dims(in);
auto& strides = _strides[0];
// If shapes fit exactly in the contiguous dims then no copy is necessary so
// let's check.
std::vector<size_t> out_strides;
bool copy_necessary = false;
int j = 0;
for (int i = 0; i < out.ndim(); i++) {
int N = out.shape(i);
if (j < shape.size() && shape[j] % N == 0) {
shape[j] /= N;
out_strides.push_back(shape[j] * strides[j]);
j += (shape[j] == 1);
} else if (N == 1) {
// i > 0 because otherwise j < shape.size() && shape[j] % 1 == 0
out_strides.push_back(out_strides.back());
} else {
copy_necessary = true;
break;
}
}
return {copy_necessary, out_strides};
}
void Reshape::shared_buffer_reshape(
const array& in,
const std::vector<size_t>& out_strides,
array& out) {
auto flags = in.flags();
if (flags.row_contiguous) {
// For row contiguous reshapes:
// - Shallow copy the buffer
// - If reshaping into a vector (all singleton dimensions except one) it
// becomes col contiguous again.
auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
}
out.copy_shared_buffer(in, out_strides, flags, in.data_size());
}
void Reshape::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
@@ -663,49 +482,6 @@ void Sinh::eval(const std::vector<array>& inputs, array& out) {
}
}
std::tuple<bool, int64_t, std::vector<int64_t>> Slice::prepare_slice(
const array& in) {
int64_t data_offset = 0;
bool copy_needed = false;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices_[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides_[i];
copy_needed |= strides_[i] < 0;
}
return std::make_tuple(copy_needed, data_offset, inp_strides);
}
void Slice::shared_buffer_slice(
const array& in,
const std::vector<size_t>& out_strides,
size_t data_offset,
array& out) {
// Compute row/col contiguity
auto [data_size, is_row_contiguous, is_col_contiguous] =
check_contiguity(out.shape(), out_strides);
auto flags = in.flags();
flags.row_contiguous = is_row_contiguous;
flags.col_contiguous = is_col_contiguous;
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in.data_size()) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
out.copy_shared_buffer(in, out_strides, flags, data_size, data_offset);
}
void Slice::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
if (out.size() == 0) {
@@ -737,18 +513,6 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
}
}
std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
const array& in) {
int64_t data_offset = 0;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices_[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides_[i];
}
return std::make_tuple(data_offset, inp_strides);
}
void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
if (out.size() == 0) {
@@ -786,58 +550,6 @@ void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
/* CopyType ctype = */ CopyType::GeneralGeneral);
}
void Split::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 1);
auto& in = inputs[0];
auto compute_new_flags = [](const auto& shape,
const auto& strides,
size_t in_data_size,
auto flags) {
size_t data_size = 1;
size_t f_stride = 1;
size_t b_stride = 1;
flags.row_contiguous = true;
flags.col_contiguous = true;
for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
flags.col_contiguous &= strides[i] == f_stride || shape[i] == 1;
flags.row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
f_stride *= shape[i];
b_stride *= shape[ri];
if (strides[i] > 0) {
data_size *= shape[i];
}
}
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in_data_size) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
return std::pair<decltype(flags), size_t>{flags, data_size};
};
std::vector<int> indices(1, 0);
indices.insert(indices.end(), indices_.begin(), indices_.end());
for (int i = 0; i < indices.size(); i++) {
size_t offset = indices[i] * in.strides()[axis_];
auto [new_flags, data_size] = compute_new_flags(
outputs[i].shape(), in.strides(), in.data_size(), in.flags());
outputs[i].copy_shared_buffer(
in, in.strides(), new_flags, data_size, offset);
}
}
void Square::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
@@ -854,11 +566,6 @@ void Sqrt::eval(const std::vector<array>& inputs, array& out) {
}
}
void StopGradient::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.copy_shared_buffer(inputs[0]);
}
void Tan::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
@@ -883,38 +590,4 @@ void Tanh::eval(const std::vector<array>& inputs, array& out) {
}
}
void Transpose::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
std::vector<size_t> out_strides(out.ndim());
auto& in = inputs[0];
for (int ax = 0; ax < axes_.size(); ++ax) {
out_strides[ax] = in.strides()[axes_[ax]];
}
// Conditions for {row/col}_contiguous
// - array must be contiguous (no gaps)
// - underlying buffer size should have the same size as the array
// - cumulative product of shapes is equal to the strides (we can ignore axes
// with size == 1)
// - in the forward direction (column contiguous)
// - in the reverse direction (row contiguous)
// - vectors are both row and col contiguous (hence if both row/col are
// true, they stay true)
auto flags = in.flags();
if (flags.contiguous && in.data_size() == in.size()) {
size_t f_stride = 1;
size_t b_stride = 1;
flags.col_contiguous = true;
flags.row_contiguous = true;
for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
flags.col_contiguous &= (out_strides[i] == f_stride || out.shape(i) == 1);
f_stride *= out.shape(i);
flags.row_contiguous &=
(out_strides[ri] == b_stride || out.shape(ri) == 1);
b_stride *= out.shape(ri);
}
}
out.copy_shared_buffer(in, out_strides, flags, in.data_size());
}
} // namespace mlx::core

View File

@@ -192,7 +192,7 @@ void _qmm_dispatch_typed(
}
void _qmm_dispatch(
array out,
array& out,
const array& x,
const array& w,
const array& scales,
@@ -253,6 +253,81 @@ void _qmm_dispatch(
}
}
void _bs_qmm_dispatch(
array& out,
const array& x,
const array& w,
const array& scales,
const array& biases,
const array& lhs_indices,
const array& rhs_indices,
int bits,
int group_size,
bool transposed_w) {
int K = x.shape(-1);
int M = x.shape(-2);
int N = out.shape(-1);
int w_els = w.shape(-1) * w.shape(-2);
int g_els = scales.shape(-1) * scales.shape(-2);
const uint32_t* lhs_indices_data = lhs_indices.data<uint32_t>();
const uint32_t* rhs_indices_data = rhs_indices.data<uint32_t>();
for (int i = 0; i < lhs_indices.size(); i++) {
int x_idx = lhs_indices_data[elem_to_loc(i, lhs_indices)];
int w_idx = rhs_indices_data[elem_to_loc(i, rhs_indices)];
switch (x.dtype()) {
case float32:
_qmm_dispatch_typed<float>(
out.data<float>() + i * M * N,
x.data<float>() + elem_to_loc(x_idx * M * K, x),
w.data<uint32_t>() + elem_to_loc(w_idx * w_els, w),
scales.data<float>() + elem_to_loc(w_idx * g_els, scales),
biases.data<float>() + elem_to_loc(w_idx * g_els, biases),
M,
N,
K,
bits,
group_size,
transposed_w);
break;
case float16:
_qmm_dispatch_typed<float16_t>(
out.data<float16_t>() + i * M * N,
x.data<float16_t>() + elem_to_loc(x_idx * M * K, x),
w.data<uint32_t>() + elem_to_loc(w_idx * w_els, w),
scales.data<float16_t>() + elem_to_loc(w_idx * g_els, scales),
biases.data<float16_t>() + elem_to_loc(w_idx * g_els, biases),
M,
N,
K,
bits,
group_size,
transposed_w);
break;
case bfloat16:
_qmm_dispatch_typed<bfloat16_t>(
out.data<bfloat16_t>() + i * M * N,
x.data<bfloat16_t>() + elem_to_loc(x_idx * M * K, x),
w.data<uint32_t>() + elem_to_loc(w_idx * w_els, w),
scales.data<bfloat16_t>() + elem_to_loc(w_idx * g_els, scales),
biases.data<bfloat16_t>() + elem_to_loc(w_idx * g_els, biases),
M,
N,
K,
bits,
group_size,
transposed_w);
break;
default:
throw std::invalid_argument(
"[quantized_matmul] only floating types are supported");
}
}
}
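Each pass through the loop above gathers one (x, w) pair: lhs_indices picks which M×K activation matrix to read and rhs_indices picks which quantized weight (along with its scales and biases), writing the i-th M×N block of the output. A dequantized sketch of the same gather pattern, with plain float weights standing in for the packed uint32 data (shapes and names illustrative):

#include <cstdio>
#include <vector>

// out block i = x[lhs_idx[i]] (M x K) times w[rhs_idx[i]] (N x K) transposed,
// mirroring the index handling in _bs_qmm_dispatch with quantization omitted.
void gather_mm(
    const std::vector<std::vector<float>>& x,
    const std::vector<std::vector<float>>& w,
    const std::vector<int>& lhs_idx,
    const std::vector<int>& rhs_idx,
    std::vector<float>& out, int M, int N, int K) {
  for (size_t i = 0; i < lhs_idx.size(); i++) {
    const auto& xi = x[lhs_idx[i]];
    const auto& wi = w[rhs_idx[i]];
    float* oi = out.data() + i * M * N;
    for (int m = 0; m < M; m++) {
      for (int n = 0; n < N; n++) {
        float r = 0.f;
        for (int k = 0; k < K; k++) {
          r += xi[m * K + k] * wi[n * K + k];
        }
        oi[m * N + n] = r;
      }
    }
  }
}

int main() {
  std::vector<std::vector<float>> x = {{1, 2}, {3, 4}};  // two 1x2 activations
  std::vector<std::vector<float>> w = {{1, 0}, {0, 1}};  // two 1x2 weights
  std::vector<float> out(2);
  gather_mm(x, w, {0, 1}, {1, 0}, out, 1, 1, 2);
  std::printf("%g %g\n", out[0], out[1]);  // 2 (= x0 . w1), 3 (= x1 . w0)
}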
} // namespace
void QuantizedMatmul::eval(const std::vector<array>& inputs, array& out) {
@@ -282,4 +357,45 @@ void QuantizedMatmul::eval(const std::vector<array>& inputs, array& out) {
_qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
}
void GatherQMM::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 6);
auto& x_pre = inputs[0];
auto& w_pre = inputs[1];
auto& scales_pre = inputs[2];
auto& biases_pre = inputs[3];
auto& lhs_indices = inputs[4];
auto& rhs_indices = inputs[5];
auto ensure_row_contiguous_last_dims = [](const array& arr) {
auto stride_0 = arr.strides()[arr.ndim() - 2];
auto stride_1 = arr.strides()[arr.ndim() - 1];
if (stride_0 == arr.shape(-1) && stride_1 == 1) {
return arr;
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::General);
return arr_copy;
}
};
auto x = ensure_row_contiguous_last_dims(x_pre);
auto w = ensure_row_contiguous_last_dims(w_pre);
auto scales = ensure_row_contiguous_last_dims(scales_pre);
auto biases = ensure_row_contiguous_last_dims(biases_pre);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
_bs_qmm_dispatch(
out,
x,
w,
scales,
biases,
lhs_indices,
rhs_indices,
group_size_,
bits_,
transpose_);
}
} // namespace mlx::core

View File

@@ -3,7 +3,6 @@
#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack_helper.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h"
namespace mlx::core {
@@ -145,12 +144,4 @@ void SVD::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
svd_impl(inputs[0], outputs[0], outputs[1], outputs[2]);
}
std::pair<std::vector<array>, std::vector<int>> SVD::vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) {
auto ax = axes[0] >= 0 ? 0 : -1;
auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
return {{linalg::svd(a, stream())}, {ax, ax, ax}};
}
} // namespace mlx::core

View File

@@ -1,27 +1,125 @@
add_custom_command(
OUTPUT compiled_preamble.cpp
function(make_jit_source SRC_FILE)
# This function takes a metal header file,
# runs the C preprocessor on it, and makes
# the processed contents available as a string in a C++ function
# mlx::core::metal::${SRC_NAME}()
#
# To use the function, declare it in jit/includes.h and
# include jit/includes.h.
#
# Additional arguments to this function are treated as dependencies
# in the CMake build system.
get_filename_component(SRC_NAME ${SRC_FILE} NAME)
add_custom_command(
OUTPUT jit/${SRC_NAME}.cpp
COMMAND /bin/bash
${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
${CMAKE_CURRENT_BINARY_DIR}/jit
${CMAKE_C_COMPILER}
${PROJECT_SOURCE_DIR}
${SRC_FILE}
"-D${MLX_METAL_VERSION}"
DEPENDS make_compiled_preamble.sh
kernels/compiled_preamble.h
kernels/unary.h
kernels/binary.h
)
kernels/${SRC_FILE}.h
${ARGN}
)
add_custom_target(${SRC_NAME} DEPENDS jit/${SRC_NAME}.cpp)
add_dependencies(mlx ${SRC_NAME})
target_sources(
mlx
PRIVATE
${CMAKE_CURRENT_BINARY_DIR}/jit/${SRC_NAME}.cpp
)
endfunction(make_jit_source)
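The net effect of make_jit_source is a generated jit/<name>.cpp that embeds the preprocessed Metal header as a string behind a function in mlx::core::metal; judging by how the strings are streamed together in Compiled::eval_gpu below, the generated function looks roughly like this (the exact signature is an assumption):

// Hypothetical shape of a generated jit/unary_ops.cpp (illustrative only).
namespace mlx::core::metal {

const char* unary_ops() {
  return R"preamble(
/* preprocessed contents of kernels/unary_ops.h would appear here */
)preamble";
}

} // namespace mlx::core::metal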
add_custom_target(
compiled_preamble
DEPENDS compiled_preamble.cpp
make_jit_source(
utils
kernels/bf16.h
kernels/complex.h
kernels/defines.h
)
make_jit_source(
unary_ops
kernels/erf.h
kernels/expm1f.h
)
make_jit_source(binary_ops)
make_jit_source(ternary_ops)
make_jit_source(
reduce_utils
kernels/atomic.h
kernels/reduction/ops.h
)
make_jit_source(scatter)
make_jit_source(gather)
add_dependencies(mlx compiled_preamble)
if (MLX_METAL_JIT)
target_sources(
mlx
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/jit_kernels.cpp
)
make_jit_source(arange)
make_jit_source(copy)
make_jit_source(unary)
make_jit_source(binary)
make_jit_source(binary_two)
make_jit_source(ternary)
make_jit_source(softmax)
make_jit_source(scan)
make_jit_source(sort)
make_jit_source(
reduce
kernels/reduction/reduce_all.h
kernels/reduction/reduce_col.h
kernels/reduction/reduce_row.h
)
make_jit_source(
steel/gemm/gemm
kernels/steel/utils.h
kernels/steel/gemm/loader.h
kernels/steel/gemm/mma.h
kernels/steel/gemm/params.h
kernels/steel/gemm/transforms.h
)
make_jit_source(steel/gemm/kernels/steel_gemm_fused)
make_jit_source(
steel/gemm/kernels/steel_gemm_masked
kernels/steel/defines.h
)
make_jit_source(steel/gemm/kernels/steel_gemm_splitk)
make_jit_source(
steel/conv/conv
kernels/steel/utils.h
kernels/steel/defines.h
kernels/steel/gemm/mma.h
kernels/steel/gemm/transforms.h
kernels/steel/conv/params.h
kernels/steel/conv/loader.h
kernels/steel/conv/loaders/loader_channel_l.h
kernels/steel/conv/loaders/loader_channel_n.h
)
make_jit_source(
steel/conv/kernels/steel_conv
)
make_jit_source(
steel/conv/kernels/steel_conv_general
kernels/steel/defines.h
kernels/steel/conv/loaders/loader_general.h
)
else()
target_sources(
mlx
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/nojit_kernels.cpp
)
endif()
target_sources(
mlx
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
@@ -40,7 +138,8 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ternary.cpp
${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
)
if (NOT MLX_METAL_PATH)

View File

@@ -0,0 +1,322 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/common/binary.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
namespace mlx::core {
constexpr int MAX_BINARY_SPECIALIZED_DIMS = 5;
void binary_op(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string op) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, outputs[0], bopt, true);
set_binary_op_output_data(a, b, outputs[1], bopt, true);
auto& out = outputs[0];
if (out.size() == 0) {
return;
}
// Try to collapse contiguous dims
auto [shape, strides] = collapse_contiguous_dims(a, b, out);
auto& strides_a = strides[0];
auto& strides_b = strides[1];
auto& strides_out = strides[2];
std::string kernel_name;
{
std::ostringstream kname;
switch (bopt) {
case BinaryOpType::ScalarScalar:
kname << "ss";
break;
case BinaryOpType::ScalarVector:
kname << "sv";
break;
case BinaryOpType::VectorScalar:
kname << "vs";
break;
case BinaryOpType::VectorVector:
kname << "vv";
break;
case BinaryOpType::General:
kname << "g";
if (shape.size() <= MAX_BINARY_SPECIALIZED_DIMS) {
kname << shape.size();
} else {
kname << "n";
}
break;
}
kname << op << type_to_name(a);
kernel_name = kname.str();
}
auto& s = out.primitive().stream();
auto& d = metal::device(s.device);
auto kernel = get_binary_two_kernel(d, kernel_name, a, outputs[0]);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
// - If a is donated it goes to the first output
// - If b is donated it goes to the first output if a was not donated
// otherwise it goes to the second output
bool donate_a = a.data_shared_ptr() == nullptr;
bool donate_b = b.data_shared_ptr() == nullptr;
compute_encoder.set_input_array(donate_a ? outputs[0] : a, 0);
compute_encoder.set_input_array(
donate_b ? (donate_a ? outputs[1] : outputs[0]) : b, 1);
compute_encoder.set_output_array(outputs[0], 2);
compute_encoder.set_output_array(outputs[1], 3);
if (bopt == BinaryOpType::General) {
auto ndim = shape.size();
if (ndim > 3) {
compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 4);
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 5);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 6);
} else {
// The shape is implicit in the grid for <= 3D
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 4);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 5);
}
if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
compute_encoder->setBytes(&ndim, sizeof(int), 7);
}
// Launch up to 3D grid of threads
size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
size_t rest = out.size() / (dim0 * dim1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size != 1024) {
throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
}
auto group_dims = get_block_dims(dim0, dim1, rest);
MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
compute_encoder.dispatchThreads(grid_dims, group_dims);
} else {
// Launch a 1D grid of threads
size_t nthreads = out.data_size();
MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size > nthreads) {
thread_group_size = nthreads;
}
MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
}
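A quick summary of the donation mapping encoded above (an input counts as donated when its data_shared_ptr is null, i.e. its buffer has already been handed to an output):

// a donated, b not: slot 0 reads outputs[0] (a's old buffer), slot 1 reads b
// b donated, a not: slot 0 reads a, slot 1 reads outputs[0] (b's old buffer)
// both donated:     slot 0 reads outputs[0], slot 1 reads outputs[1]
// neither donated:  slots 0 and 1 read a and b as usual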
void binary_op(
const std::vector<array>& inputs,
array& out,
const std::string op) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out, bopt, true);
if (out.size() == 0) {
return;
}
// Try to collapse contiguous dims
auto [shape, strides] = collapse_contiguous_dims(a, b, out);
auto& strides_a = strides[0];
auto& strides_b = strides[1];
auto& strides_out = strides[2];
std::string kernel_name;
{
std::ostringstream kname;
switch (bopt) {
case BinaryOpType::ScalarScalar:
kname << "ss";
break;
case BinaryOpType::ScalarVector:
kname << "sv";
break;
case BinaryOpType::VectorScalar:
kname << "vs";
break;
case BinaryOpType::VectorVector:
kname << "vv";
break;
case BinaryOpType::General:
kname << "g";
if (shape.size() <= MAX_BINARY_SPECIALIZED_DIMS) {
kname << shape.size();
} else {
kname << "n";
}
break;
}
kname << op << type_to_name(a);
kernel_name = kname.str();
}
auto& s = out.primitive().stream();
auto& d = metal::device(s.device);
auto kernel = get_binary_kernel(d, kernel_name, a, out);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
bool donate_a = a.data_shared_ptr() == nullptr;
bool donate_b = b.data_shared_ptr() == nullptr;
compute_encoder.set_input_array(donate_a ? out : a, 0);
compute_encoder.set_input_array(donate_b ? out : b, 1);
compute_encoder.set_output_array(out, 2);
if (bopt == BinaryOpType::General) {
auto ndim = shape.size();
if (ndim > 3) {
compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 3);
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 4);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 5);
} else {
// The shape is implicit in the grid for <= 3D
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 3);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 4);
}
if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
compute_encoder->setBytes(&ndim, sizeof(int), 6);
}
// Launch up to 3D grid of threads
size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
size_t rest = out.size() / (dim0 * dim1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size != 1024) {
throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
}
auto group_dims = get_block_dims(dim0, dim1, rest);
MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
compute_encoder.dispatchThreads(grid_dims, group_dims);
} else {
// Launch a 1D grid of threads
size_t nthreads =
bopt == BinaryOpType::General ? out.size() : out.data_size();
MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size > nthreads) {
thread_group_size = nthreads;
}
MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
}
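For the general case the dispatch flattens everything beyond the last two axes into the third grid dimension: dim0 is the innermost extent, dim1 the next one, and rest covers all remaining axes. A minimal sketch of that flattening (the shape values here are arbitrary):

#include <cstdio>
#include <vector>

int main() {
  std::vector<size_t> shape = {3, 4, 5, 6};  // collapsed output shape
  size_t size = 3 * 4 * 5 * 6;
  size_t ndim = shape.size();
  size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;  // innermost axis -> grid x
  size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;  // next axis -> grid y
  size_t rest = size / (dim0 * dim1);            // everything else -> grid z
  std::printf("grid = (%zu, %zu, %zu)\n", dim0, dim1, rest);  // (6, 5, 12)
}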
void Add::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "add");
}
void ArcTan2::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "arctan2");
}
void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
switch (op_) {
case BitwiseBinary::And:
binary_op(inputs, out, "bitwise_and");
break;
case BitwiseBinary::Or:
binary_op(inputs, out, "bitwise_or");
break;
case BitwiseBinary::Xor:
binary_op(inputs, out, "bitwise_xor");
break;
case BitwiseBinary::LeftShift:
binary_op(inputs, out, "left_shift");
break;
case BitwiseBinary::RightShift:
binary_op(inputs, out, "right_shift");
break;
}
}
void Divide::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "div");
}
void DivMod::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
binary_op(inputs, outputs, "divmod");
}
void Remainder::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "rem");
}
void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, equal_nan_ ? "naneq" : "eq");
}
void Greater::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "ge");
}
void GreaterEqual::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "geq");
}
void Less::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "le");
}
void LessEqual::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "leq");
}
void LogicalAnd::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "land");
}
void LogicalOr::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "lor");
}
void LogAddExp::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "lae");
}
void Maximum::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "max");
}
void Minimum::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "min");
}
void Multiply::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "mul");
}
void NotEqual::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "neq");
}
void Power::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "pow");
}
void Subtract::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "sub");
}
} // namespace mlx::core

View File

@@ -4,8 +4,8 @@
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/metal/compiled_preamble.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/graph_utils.h"
#include "mlx/primitives.h"
@@ -190,7 +190,8 @@ void Compiled::eval_gpu(
// If not we have to build it ourselves
if (lib == nullptr) {
std::ostringstream kernel;
kernel << metal::get_kernel_preamble() << std::endl;
kernel << metal::utils() << metal::unary_ops() << metal::binary_ops()
<< metal::ternary_ops();
build_kernel(
kernel,
kernel_lib_ + "_contiguous",
@@ -336,7 +337,7 @@ void Compiled::eval_gpu(
MTL::Size grid_dims(nthreads, 1, 1);
MTL::Size group_dims(
std::min(nthreads, kernel->maxTotalThreadsPerThreadgroup()), 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
} else {
size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
@@ -347,7 +348,7 @@ void Compiled::eval_gpu(
}
auto group_dims = get_block_dims(dim0, dim1, rest);
MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
}

View File

@@ -1,9 +0,0 @@
// Copyright © 2023-24 Apple Inc.
#pragma once
namespace mlx::core::metal {
const char* get_kernel_preamble();
}

View File

@@ -7,6 +7,7 @@
#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/steel/conv/params.h"
#include "mlx/backend/metal/matmul.h"
@@ -59,7 +60,7 @@ void explicit_gemm_conv_ND_gpu(
MTL::Size grid_dims = MTL::Size(
conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]);
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
// Reshape weight
std::vector<int> wt_reshape{implicit_K, implicit_N};
@@ -137,7 +138,7 @@ void explicit_gemm_conv_group_ND_gpu(
MTL::Size grid_dims = MTL::Size(
conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]);
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
// Transpose kernel weights so that we can slice them by contiguous chunks
// of channel groups.
@@ -247,7 +248,7 @@ void slow_conv_2D_gpu(
compute_encoder.set_output_array(out, 2);
compute_encoder->setBytes(&conv_params, sizeof(MLXConvParams<2>), 3);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
}
void implicit_gemm_conv_2D_gpu(
@@ -257,15 +258,19 @@ void implicit_gemm_conv_2D_gpu(
const array& wt,
array out,
const MLXConvParams<2>& conv_params) {
const int groups = conv_params.groups;
const int C_per_group = conv_params.C / conv_params.groups;
const int O_per_group = conv_params.O / conv_params.groups;
// Deduce implicit gemm size
int implicit_M = conv_params.N * conv_params.oS[0] * conv_params.oS[1];
int implicit_N = conv_params.O;
int implicit_K = conv_params.wS[0] * conv_params.wS[1] * conv_params.C;
const int implicit_M = conv_params.N * conv_params.oS[0] * conv_params.oS[1];
const int implicit_N = O_per_group;
const int implicit_K = conv_params.wS[0] * conv_params.wS[1] * C_per_group;
// Determine block and warp tiles
int wm = 2, wn = 2;
int bm = implicit_M >= 8192 && conv_params.C >= 64 ? 64 : 32;
int bm = implicit_M >= 8192 && C_per_group >= 64 ? 64 : 32;
int bn = (bm == 64 || implicit_N >= 64) ? 64 : 32;
int bk = 16;
@@ -281,15 +286,15 @@ void implicit_gemm_conv_2D_gpu(
// Fix small channel specialization
int n_channel_specialization = 0;
int channel_k_iters = ((conv_params.C + bk - 1) / bk);
int channel_k_iters = ((C_per_group + bk - 1) / bk);
int gemm_k_iters = conv_params.wS[0] * conv_params.wS[1] * channel_k_iters;
if (conv_params.C <= 2) {
if (C_per_group <= 2) {
gemm_k_iters = (implicit_K + bk - 1) / bk;
n_channel_specialization = conv_params.C;
} else if (conv_params.C <= 4) {
n_channel_specialization = C_per_group;
} else if (C_per_group <= 4) {
gemm_k_iters = ((conv_params.wS[0] * conv_params.wS[1] * 4) + bk - 1) / bk;
n_channel_specialization = conv_params.C;
n_channel_specialization = C_per_group;
}
bool small_filter = (!n_channel_specialization) &&
@@ -331,7 +336,17 @@ void implicit_gemm_conv_2D_gpu(
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto kernel = get_steel_conv_kernel(
d,
kname.str(),
out,
bm,
bn,
bk,
wm,
wn,
n_channel_specialization,
small_filter);
compute_encoder->setComputePipelineState(kernel);
// Deduce grid launch dimensions
@@ -340,7 +355,7 @@ void implicit_gemm_conv_2D_gpu(
size_t grid_dim_x = tn * tile;
MTL::Size group_dims = MTL::Size(32, wn, wm);
MTL::Size grid_dims = MTL::Size(grid_dim_x, grid_dim_y, 1);
MTL::Size grid_dims = MTL::Size(grid_dim_x, grid_dim_y, groups);
// Encode arrays
compute_encoder.set_input_array(in, 0);
@@ -352,7 +367,7 @@ void implicit_gemm_conv_2D_gpu(
compute_encoder->setBytes(&gemm_params, sizeof(ImplicitGemmConv2DParams), 4);
// Launch kernel
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
}
void implicit_gemm_conv_2D_general_gpu(
@@ -484,7 +499,8 @@ void implicit_gemm_conv_2D_general_gpu(
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto kernel =
get_steel_conv_general_kernel(d, kname.str(), out, bm, bn, bk, wm, wn);
compute_encoder->setComputePipelineState(kernel);
// Deduce grid launch dimensions
@@ -512,7 +528,7 @@ void implicit_gemm_conv_2D_general_gpu(
base_w.data(), sizeof(Conv2DGeneralBaseInfo) * base_w.size(), 7);
// Launch kernel
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
}
void winograd_conv_2D_gpu(
@@ -613,7 +629,7 @@ void winograd_conv_2D_gpu(
MTL::Size group_dims = MTL::Size(32, bo, 1);
MTL::Size grid_dims = MTL::Size(O_c / bo, 1, 1);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
}
// Do input transform
@@ -641,7 +657,7 @@ void winograd_conv_2D_gpu(
MTL::Size group_dims = MTL::Size(32, wn, wm);
MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
}
// Do batched gemm
@@ -689,7 +705,7 @@ void winograd_conv_2D_gpu(
MTL::Size group_dims = MTL::Size(32, wn, wm);
MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
}
}
@@ -703,6 +719,7 @@ void conv_2D_gpu(
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation,
const std::vector<int>& in_dilation,
const int groups,
bool flip,
std::vector<array>& copies) {
// Make conv params
@@ -718,12 +735,12 @@ void conv_2D_gpu(
/* const int kdil[NDIM] = */ {wt_dilation[0], wt_dilation[1]},
/* const int idil[NDIM] = */ {in_dilation[0], in_dilation[1]},
/* const size_t in_strides[NDIM + 2] = */
{in.strides()[0], in.strides()[1], in.strides()[2], in.strides()[3]},
{in.strides(0), in.strides(1), in.strides(2), in.strides(3)},
/* const size_t wt_strides[NDIM + 2] = */
{wt.strides()[0], wt.strides()[1], wt.strides()[2], wt.strides()[3]},
{wt.strides(0), wt.strides(1), wt.strides(2), wt.strides(3)},
/* const size_t out_strides[NDIM + 2] = */
{out.strides()[0], out.strides()[1], out.strides()[2], out.strides()[3]},
/* const int groups = */ 1,
{out.strides(0), out.strides(1), out.strides(2), out.strides(3)},
/* const int groups = */ groups,
/* const bool flip = */ flip,
};
@@ -735,6 +752,18 @@ void conv_2D_gpu(
bool channels_large = (conv_params.C + conv_params.O) >= 512;
bool channels_med = (conv_params.C + conv_params.O) >= 256;
if (groups > 1) {
const int C_per_group = conv_params.C / groups;
const int O_per_group = conv_params.O / groups;
if (is_idil_one && (C_per_group <= 4 || C_per_group % 16 == 0) &&
(O_per_group <= 16 || O_per_group % 16 == 0)) {
return implicit_gemm_conv_2D_gpu(s, d, in, wt, out, conv_params);
} else {
return explicit_gemm_conv_group_ND_gpu(s, d, in, wt, out, conv_params);
}
}
// Direct to winograd conv
if (!flip && is_stride_one && is_kdil_one && is_idil_one &&
conv_params.wS[0] == 3 && conv_params.wS[1] == 3 &&
@@ -759,6 +788,56 @@ void conv_2D_gpu(
}
}
void conv_3D_gpu(
const Stream& s,
metal::Device& d,
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation,
const std::vector<int>& in_dilation,
bool flip,
std::vector<array>& copies) {
// Make conv params
MLXConvParams<3> conv_params{
/* const int N = */ in.shape(0),
/* const int C = */ in.shape(4),
/* const int O = */ wt.shape(0),
/* const int iS[NDIM] = */ {in.shape(1), in.shape(2), in.shape(3)},
/* const int wS[NDIM] = */ {wt.shape(1), wt.shape(2), wt.shape(3)},
/* const int oS[NDIM] = */ {out.shape(1), out.shape(2), out.shape(3)},
/* const int str[NDIM] = */ {wt_strides[0], wt_strides[1], wt_strides[2]},
/* const int pad[NDIM] = */ {padding[0], padding[1], padding[2]},
/* const int kdil[NDIM] = */
{wt_dilation[0], wt_dilation[1], wt_dilation[2]},
/* const int idil[NDIM] = */
{in_dilation[0], in_dilation[1], in_dilation[2]},
/* const size_t in_strides[NDIM + 2] = */
{in.strides()[0],
in.strides()[1],
in.strides()[2],
in.strides()[3],
in.strides()[4]},
/* const size_t wt_strides[NDIM + 2] = */
{wt.strides()[0],
wt.strides()[1],
wt.strides()[2],
wt.strides()[3],
wt.strides()[4]},
/* const size_t out_strides[NDIM + 2] = */
{out.strides()[0],
out.strides()[1],
out.strides()[2],
out.strides()[3],
out.strides()[4]},
/* const int groups = */ 1,
/* const bool flip = */ flip,
};
return explicit_gemm_conv_ND_gpu(s, d, in, wt, out, conv_params);
}
} // namespace
void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -783,8 +862,23 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
wt = arr_copy;
}
// 3D conv
if (out.ndim() == 5) {
conv_3D_gpu(
s,
d,
in,
wt,
out,
padding_,
kernel_strides_,
kernel_dilation_,
input_dilation_,
flip_,
copies);
}
// 2D conv
if (out.ndim() == 4) {
else if (out.ndim() == 4) {
conv_2D_gpu(
s,
d,
@@ -795,6 +889,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
kernel_strides_,
kernel_dilation_,
input_dilation_,
groups_,
flip_,
copies);
}

View File

@@ -4,12 +4,14 @@
#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
namespace mlx::core {
constexpr int MAX_COPY_SPECIALIZED_DIMS = 5;
void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
if (ctype == CopyType::Vector) {
// If the input is donateable, we are doing a vector copy and the types
@@ -62,27 +64,34 @@ void copy_gpu_inplace(
auto& strides_out_ = strides[1];
auto& d = metal::device(s.device);
std::ostringstream kname;
switch (ctype) {
case CopyType::Scalar:
kname << "scopy";
break;
case CopyType::Vector:
kname << "vcopy";
break;
case CopyType::General:
kname << "gcopy";
break;
case CopyType::GeneralGeneral:
kname << "ggcopy";
break;
std::string kernel_name;
{
std::ostringstream kname;
switch (ctype) {
case CopyType::Scalar:
kname << "s";
break;
case CopyType::Vector:
kname << "v";
break;
case CopyType::General:
kname << "g";
break;
case CopyType::GeneralGeneral:
kname << "gg";
break;
}
if ((ctype == CopyType::General || ctype == CopyType::GeneralGeneral) &&
shape.size() <= MAX_COPY_SPECIALIZED_DIMS) {
kname << shape.size();
}
kname << "_copy";
kname << type_to_name(in) << type_to_name(out);
kernel_name = kname.str();
}
kname << type_to_name(in) << type_to_name(out);
if ((ctype == CopyType::General || ctype == CopyType::GeneralGeneral) &&
shape.size() <= MAX_COPY_SPECIALIZED_DIMS) {
kname << "_" << shape.size();
}
auto kernel = d.get_kernel(kname.str());
auto kernel = get_copy_kernel(d, kernel_name, in, out);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
bool donate_in = in.data_shared_ptr() == nullptr;
@@ -106,7 +115,7 @@ void copy_gpu_inplace(
set_vector_bytes(compute_encoder, strides_out, ndim, 4);
}
if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
if (ndim > MAX_COPY_SPECIALIZED_DIMS) {
compute_encoder->setBytes(&ndim, sizeof(int), 5);
}
@@ -126,7 +135,7 @@ void copy_gpu_inplace(
auto group_dims = get_block_dims(dim0, dim1, rest);
MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
} else {
size_t nthreads = out.data_size();
MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
@@ -135,7 +144,7 @@ void copy_gpu_inplace(
thread_group_size = nthreads;
}
MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
}

View File

@@ -25,9 +25,18 @@ namespace {
// TODO nicer way to set this or possibly expose as an environment variable
constexpr int MAX_BUFFERS_PER_QUEUE = 12;
constexpr int MAX_DISPATCHES_PER_ENCODER = 2;
constexpr const char* default_mtllib_path = METAL_PATH;
constexpr auto get_metal_version() {
#if defined METAL_3_1
return MTL::LanguageVersion3_1;
#else
return MTL::LanguageVersion3_0;
#endif
}
auto load_device() {
auto devices = MTL::CopyAllDevices();
auto device = static_cast<MTL::Device*>(devices->object(0))
@@ -37,7 +46,6 @@ auto load_device() {
}
return device;
}
std::pair<MTL::Library*, NS::Error*> load_library_from_path(
MTL::Device* device,
const char* path) {
@@ -116,6 +124,33 @@ MTL::Library* load_library(
} // namespace
void CommandEncoder::dispatchThreadgroups(
MTL::Size grid_dims,
MTL::Size group_dims) {
num_dispatches++;
enc->dispatchThreadgroups(grid_dims, group_dims);
maybe_split();
}
void CommandEncoder::dispatchThreads(
MTL::Size grid_dims,
MTL::Size group_dims) {
num_dispatches++;
enc->dispatchThreads(grid_dims, group_dims);
maybe_split();
}
void CommandEncoder::maybe_split() {
if (num_dispatches > MAX_DISPATCHES_PER_ENCODER && !concurrent) {
enc->endEncoding();
enc->release();
num_dispatches = 0;
outputs.clear();
enc = cbuf->computeCommandEncoder(MTL::DispatchTypeConcurrent);
enc->retain();
}
}
Device::Device() {
auto pool = new_scoped_memory_pool();
device_ = load_device();
@@ -130,9 +165,6 @@ Device::~Device() {
for (auto& b : buffer_map_) {
b.second.second->release();
}
for (auto& e : encoder_map_) {
(*e.second)->release();
}
for (auto& k : kernel_map_) {
k.second->release();
}
@@ -169,27 +201,26 @@ void Device::increment_command_buffer_ops(int index) {
MTL::CommandBuffer* Device::get_command_buffer(int index) {
auto bit = buffer_map_.find(index);
return (bit == buffer_map_.end()) ? nullptr : bit->second.second;
}
if (bit == buffer_map_.end()) {
auto qit = queue_map_.find(index);
if (qit == queue_map_.end()) {
throw std::runtime_error(
"[metal::Device] Attempting to get command buffer for invalid queue.");
}
MTL::CommandBuffer* Device::new_command_buffer(int index) {
auto qit = queue_map_.find(index);
if (qit == queue_map_.end()) {
throw std::runtime_error(
"[metal::Device] Attempting to get command buffer for invalid queue.");
auto cb = qit->second->commandBufferWithUnretainedReferences();
if (!cb) {
throw std::runtime_error(
"[metal::Device] Unable to create new command buffer");
}
// Increment ref count so the buffer is not garbage collected
cb->retain();
bit = buffer_map_.insert({index, {0, cb}}).first;
}
auto cb = qit->second->commandBufferWithUnretainedReferences();
if (!cb) {
throw std::runtime_error(
"[metal::Device] Unable to create new command buffer");
}
// Increment ref count so the buffer is not garbage collected
cb->retain();
return buffer_map_.insert({index, {0, cb}}).first->second.second;
return bit->second.second;
}
void Device::commit_command_buffer(int index) {
@@ -200,25 +231,15 @@ void Device::commit_command_buffer(int index) {
}
void Device::end_encoding(int index) {
auto eit = encoder_map_.find(index);
if (eit != encoder_map_.end()) {
(*eit->second)->endEncoding();
(*eit->second)->release();
encoder_map_.erase(eit);
}
encoder_map_.erase(index);
}
CommandEncoder& Device::get_command_encoder(int index) {
auto eit = encoder_map_.find(index);
if (eit == encoder_map_.end()) {
auto cb = get_command_buffer(index);
auto compute_encoder =
cb->computeCommandEncoder(MTL::DispatchTypeConcurrent);
// Increment ref count so the buffer is not garbage collected
compute_encoder->retain();
eit = encoder_map_
.emplace(index, std::make_unique<CommandEncoder>(compute_encoder))
.first;
eit =
encoder_map_.emplace(index, std::make_unique<CommandEncoder>(cb)).first;
}
return *(eit->second);
}
@@ -262,7 +283,11 @@ MTL::Library* Device::get_library_(const std::string& source_string) {
NS::String::string(source_string.c_str(), NS::ASCIIStringEncoding);
NS::Error* error = nullptr;
auto mtl_lib = device_->newLibrary(ns_code, nullptr, &error);
auto options = MTL::CompileOptions::alloc()->init();
options->setFastMathEnabled(false);
options->setLanguageVersion(get_metal_version());
auto mtl_lib = device_->newLibrary(ns_code, options, &error);
options->release();
// Throw error if unable to compile library
if (!mtl_lib) {
@@ -344,7 +369,6 @@ MTL::Function* Device::get_function_(
}
mtl_func_consts->release();
desc->release();
return mtl_function;
}
@@ -513,11 +537,13 @@ MTL::ComputePipelineState* Device::get_kernel(
// Compile kernel to compute pipeline
auto mtl_linked_funcs = get_linked_functions_(linked_functions);
auto kernel = get_kernel_(kname, mtl_function, mtl_linked_funcs);
mtl_function->release();
mtl_linked_funcs->release();
// Add kernel to cache
kernel_map_.insert({kname, kernel});
return kernel;
}
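
The new CommandEncoder::maybe_split above is the heart of this file's change: each dispatch bumps a counter, and once a non-concurrent encoder exceeds MAX_DISPATCHES_PER_ENCODER it is ended and replaced by a fresh concurrent encoder on the same command buffer, with the tracked outputs cleared. A toy, Metal-free sketch of that counter-and-split control flow (Encoder is a hypothetical stand-in, not MTL::ComputeCommandEncoder):

// Toy stand-in for the split-after-N-dispatches pattern; runnable anywhere.
#include <iostream>
#include <memory>

struct Encoder {  // hypothetical stand-in for the Metal encoder
  void end() { std::cout << "endEncoding\n"; }
};

class SplittingEncoder {
 public:
  void dispatch() {
    ++num_dispatches_;
    std::cout << "dispatch " << num_dispatches_ << "\n";
    maybe_split();
  }

 private:
  void maybe_split() {
    // Mirrors CommandEncoder::maybe_split: past the cap, end the encoder,
    // reset the counter (the real code also clears the tracked outputs),
    // and open a new encoder on the same command buffer.
    if (num_dispatches_ > kMaxDispatches && !concurrent_) {
      enc_->end();
      enc_ = std::make_unique<Encoder>();
      num_dispatches_ = 0;
    }
  }
  static constexpr int kMaxDispatches = 2;  // MAX_DISPATCHES_PER_ENCODER
  int num_dispatches_{0};
  bool concurrent_{false};
  std::unique_ptr<Encoder> enc_ = std::make_unique<Encoder>();
};

int main() {
  SplittingEncoder e;
  for (int i = 0; i < 7; ++i) e.dispatch();  // splits after every 3rd dispatch
}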

View File

@@ -37,8 +37,10 @@ using MTLFCList =
std::vector<std::tuple<const void*, MTL::DataType, NS::UInteger>>;
struct CommandEncoder {
CommandEncoder(MTL::ComputeCommandEncoder* enc)
: enc(enc), concurrent(false) {};
CommandEncoder(MTL::CommandBuffer* cbuf) : cbuf(cbuf) {
enc = cbuf->computeCommandEncoder(MTL::DispatchTypeConcurrent);
enc->retain();
};
CommandEncoder(const CommandEncoder&) = delete;
CommandEncoder& operator=(const CommandEncoder&) = delete;
@@ -61,7 +63,7 @@ struct CommandEncoder {
return enc;
}
void set_input_array(const array& a, int idx, int offset = 0) {
void set_input_array(const array& a, int idx, int64_t offset = 0) {
auto r_buf =
static_cast<MTL::Resource*>(const_cast<void*>(a.buffer().ptr()));
if (auto it = outputs.find(r_buf); it != outputs.end()) {
@@ -78,7 +80,7 @@ struct CommandEncoder {
enc->setBuffer(a_buf, base_offset, idx);
}
void set_output_array(array& a, int idx, int offset = 0) {
void set_output_array(array& a, int idx, int64_t offset = 0) {
// Add barriers before adding the output to the output set
set_input_array(a, idx, offset);
auto buf = static_cast<MTL::Resource*>(a.buffer().ptr());
@@ -89,13 +91,25 @@ struct CommandEncoder {
}
}
void dispatchThreadgroups(MTL::Size grid_dims, MTL::Size group_dims);
void dispatchThreads(MTL::Size grid_dims, MTL::Size group_dims);
ConcurrentContext start_concurrent() {
return ConcurrentContext(*this);
}
~CommandEncoder() {
enc->endEncoding();
enc->release();
}
private:
void maybe_split();
int num_dispatches{0};
MTL::CommandBuffer* cbuf;
MTL::ComputeCommandEncoder* enc;
bool concurrent;
bool concurrent{false};
std::unordered_set<MTL::Resource*> outputs;
std::unordered_set<MTL::Resource*> concurrent_outputs;
};
@@ -112,7 +126,6 @@ class Device {
};
void new_queue(int index);
MTL::CommandBuffer* new_command_buffer(int index);
MTL::CommandBuffer* get_command_buffer(int index);
int get_command_buffer_ops(int index);
void increment_command_buffer_ops(int index);
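
Note the offset parameters of set_input_array and set_output_array widening from int to int64_t: a byte offset past 2 GiB into a large buffer overflows a 32-bit int before it ever reaches setBuffer. A small self-contained demonstration of that failure mode:

// Why the offsets widened to int64_t; illustration only.
#include <cstdint>
#include <iostream>

int main() {
  int64_t rows = 70'000, row_bytes = 70'000;  // ~4.9 GB of data
  int64_t ok = rows * row_bytes;              // 4'900'000'000 fits in int64_t
  int32_t bad = static_cast<int32_t>(ok);     // out of range: value wraps
  std::cout << "int64_t offset: " << ok << "\n"
            << "int32_t offset: " << bad << "\n";
}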

View File

@@ -97,7 +97,7 @@ void FFT::eval_gpu(const std::vector<array>& inputs, array& out) {
auto group_dims = MTL::Size(1, m, 1);
auto grid_dims = MTL::Size(batch, m, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
d.get_command_buffer(s.index)->addCompletedHandler(
[copies](MTL::CommandBuffer*) mutable { copies.clear(); });

View File

@@ -1,24 +1,35 @@
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cassert>
#include <numeric>
#include <sstream>
#include <fmt/format.h>
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/jit/indexing.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
namespace mlx::core {
namespace {
constexpr int METAL_MAX_INDEX_ARRAYS = 20;
constexpr int METAL_MAX_INDEX_ARRAYS = 10;
} // namespace
std::pair<std::string, std::string> make_index_args(
const std::string& idx_type,
int nidx) {
std::ostringstream idx_args;
std::ostringstream idx_arr;
for (int i = 0; i < nidx; ++i) {
idx_args << fmt::format(
"const device {0} *idx{1} [[buffer({2})]],", idx_type, i, 20 + i);
idx_arr << fmt::format("idx{0}", i);
if (i < nidx - 1) {
idx_args << "\n";
idx_arr << ",";
}
}
return {idx_args.str(), idx_arr.str()};
}
void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& src = inputs[0];
@@ -42,15 +53,41 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
int idx_ndim = nidx ? inputs[1].ndim() : 0;
size_t ndim = src.ndim();
std::ostringstream kname;
std::string lib_name;
std::string kernel_name;
std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";
kname << "gather" << type_to_name(src) << idx_type_name << "_" << nidx;
if (idx_ndim <= 1) {
kname << "_" << idx_ndim;
{
std::ostringstream kname;
kname << "gather" << type_to_name(out) << idx_type_name << "_" << nidx
<< "_" << idx_ndim;
lib_name = kname.str();
kernel_name = lib_name;
}
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::gather();
std::string out_type_str = get_type_string(out.dtype());
std::string idx_type_str =
nidx ? get_type_string(inputs[1].dtype()) : "bool";
auto [idx_args, idx_arr] = make_index_args(idx_type_str, nidx);
// Index dimension specializations
kernel_source << fmt::format(
gather_kernels,
type_to_name(out) + idx_type_name,
out_type_str,
idx_type_str,
nidx,
idx_args,
idx_arr,
idx_ndim);
lib = d.get_library(lib_name, kernel_source.str());
}
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto kernel = d.get_kernel(kernel_name, lib);
compute_encoder->setComputePipelineState(kernel);
size_t slice_size = 1;
@@ -102,12 +139,12 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder->setBytes(&idx_ndim, sizeof(int), 9);
// Set index buffers
for (int i = 1; i < nidx + 1; ++i) {
compute_encoder.set_input_array(inputs[i], 20 + i);
for (int i = 0; i < nidx; ++i) {
compute_encoder.set_input_array(inputs[i + 1], 20 + i);
}
// Launch grid
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -139,10 +176,6 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& s = stream();
auto& d = metal::device(s.device);
// Get kernel name
std::ostringstream kname;
std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";
int idx_ndim = nidx ? inputs[1].ndim() : 0;
bool index_nd1_specialization = (idx_ndim == 1);
@@ -159,32 +192,86 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
index_nd1_specialization &= inputs[i].flags().row_contiguous;
}
if (index_nd1_specialization) {
kname << "scatter_1d_index" << type_to_name(out) << idx_type_name;
} else {
kname << "scatter" << type_to_name(out) << idx_type_name;
}
std::string lib_name;
std::string kernel_name;
std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";
std::string op_name;
switch (reduce_type_) {
case Scatter::None:
kname << "_none";
op_name = "none";
break;
case Scatter::Sum:
kname << "_sum";
op_name = "sum";
break;
case Scatter::Prod:
kname << "_prod";
op_name = "prod";
break;
case Scatter::Max:
kname << "_max";
op_name = "max";
break;
case Scatter::Min:
kname << "_min";
op_name = "min";
break;
}
kname << "_" << nidx;
{
std::ostringstream kname;
if (index_nd1_specialization) {
kname << "scatter_1d_index" << type_to_name(out) << idx_type_name;
} else {
kname << "scatter" << type_to_name(out) << idx_type_name;
}
kname << "_" << op_name << "_" << nidx;
lib_name = kname.str();
kernel_name = kname.str();
}
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::reduce_utils()
<< metal::scatter();
std::string out_type_str = get_type_string(out.dtype());
std::string idx_type_str =
nidx ? get_type_string(inputs[1].dtype()) : "bool";
std::string op_type;
switch (reduce_type_) {
case Scatter::None:
op_type = "None";
break;
case Scatter::Sum:
op_type = "Sum<{0}>";
break;
case Scatter::Prod:
op_type = "Prod<{0}>";
break;
case Scatter::Max:
op_type = "Max<{0}>";
break;
case Scatter::Min:
op_type = "Min<{0}>";
break;
}
if (reduce_type_ != Scatter::None) {
op_type = fmt::format(op_type, out_type_str);
}
auto [idx_args, idx_arr] = make_index_args(idx_type_str, nidx);
kernel_source << fmt::format(
scatter_kernels,
type_to_name(out) + idx_type_name + "_" + op_name,
out_type_str,
idx_type_str,
op_type,
nidx,
idx_args,
idx_arr);
lib = d.get_library(lib_name, kernel_source.str());
}
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
auto kernel = d.get_kernel(kernel_name, lib);
auto& upd = inputs.back();
size_t nthreads = upd.size();
@@ -209,14 +296,14 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder->setBytes(&upd_size, sizeof(size_t), 5);
// Set index buffers
for (int i = 1; i < nidx + 1; ++i) {
compute_encoder.set_input_array(inputs[i], 20 + i);
for (int i = 0; i < nidx; ++i) {
compute_encoder.set_input_array(inputs[i + 1], 20 + i);
}
// Launch grid
MTL::Size grid_dims = MTL::Size(upd_size, nthreads / upd_size, 1);
MTL::Size group_dims = get_block_dims(upd_size, nthreads / upd_size, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
} else {
// Collect all idx shapes and strides into one place
@@ -279,14 +366,14 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder->setBytes(&idx_ndim, sizeof(int), 13);
// Set index buffers
for (int i = 1; i < nidx + 1; ++i) {
compute_encoder.set_input_array(inputs[i], 20 + i);
for (int i = 0; i < nidx; ++i) {
compute_encoder.set_input_array(inputs[i + 1], 20 + i);
}
// Launch grid
MTL::Size grid_dims = MTL::Size(upd_size, nthreads / upd_size, 1);
MTL::Size group_dims = get_block_dims(upd_size, nthreads / upd_size, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
}
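
To make the generated text concrete, here is make_index_args from the hunk above wrapped in a runnable program (requires fmt); for two uint32 index arrays it emits buffer bindings 20 and 21 plus the initializer list spliced into Indices:

// What make_index_args produces for nidx = 2; the function body is copied
// verbatim from the hunk above, only main() is added for illustration.
#include <fmt/format.h>
#include <sstream>
#include <string>
#include <utility>

std::pair<std::string, std::string> make_index_args(
    const std::string& idx_type, int nidx) {
  std::ostringstream idx_args, idx_arr;
  for (int i = 0; i < nidx; ++i) {
    idx_args << fmt::format(
        "const device {0} *idx{1} [[buffer({2})]],", idx_type, i, 20 + i);
    idx_arr << fmt::format("idx{0}", i);
    if (i < nidx - 1) {
      idx_args << "\n";
      idx_arr << ",";
    }
  }
  return {idx_args.str(), idx_arr.str()};
}

int main() {
  auto [args, arr] = make_index_args("uint32_t", 2);
  fmt::print("{}\n{{ {} }}\n", args, arr);
  // const device uint32_t *idx0 [[buffer(20)]],
  // const device uint32_t *idx1 [[buffer(21)]],
  // { idx0,idx1 }
}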

View File

@@ -0,0 +1,9 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view arange_kernels = R"(
template [[host_name("{0}")]] [[kernel]] void arange<{1}>(
constant const {1}& start,
constant const {1}& step,
device {1}* out,
uint index [[thread_position_in_grid]]);
)";

View File

@@ -0,0 +1,87 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view binary_kernels = R"(
template [[host_name("ss{0}")]] [[kernel]]
void binary_ss<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
uint index [[thread_position_in_grid]]);
template [[host_name("vs{0}")]] [[kernel]]
void binary_vs<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
uint index [[thread_position_in_grid]]);
template [[host_name("sv{0}")]] [[kernel]]
void binary_sv<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
uint index [[thread_position_in_grid]]);
template [[host_name("vv{0}")]] [[kernel]]
void binary_vv<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
uint index [[thread_position_in_grid]]);
template [[host_name("g4{0}")]] [[kernel]] void
binary_g_nd<{1}, {2}, {3}, 4>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const int shape[4],
constant const size_t a_strides[4],
constant const size_t b_strides[4],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g5{0}")]] [[kernel]] void
binary_g_nd<{1}, {2}, {3}, 5>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const int shape[5],
constant const size_t a_strides[5],
constant const size_t b_strides[5],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g1{0}")]] [[kernel]] void
binary_g_nd1<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]);
template [[host_name("g2{0}")]] [[kernel]] void
binary_g_nd2<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]);
template [[host_name("g3{0}")]] [[kernel]] void
binary_g_nd3<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("gn{0}")]] [[kernel]]
void binary_g<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
)";

View File

@@ -0,0 +1,98 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view binary_two_kernels = R"(
template [[host_name("ss{0}")]] [[kernel]]
void binary_ss<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("vs{0}")]] [[kernel]]
void binary_vs<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("sv{0}")]] [[kernel]]
void binary_sv<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("vv{0}")]] [[kernel]]
void binary_vv<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("g4{0}")]] [[kernel]] void
binary_g_nd<{1}, {2}, {3}, 4>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const int shape[4],
constant const size_t a_strides[4],
constant const size_t b_strides[4],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g5{0}")]] [[kernel]] void
binary_g_nd<{1}, {2}, {3}, 5>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const int shape[5],
constant const size_t a_strides[5],
constant const size_t b_strides[5],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g1{0}")]] [[kernel]] void
binary_g_nd1<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]);
template [[host_name("g2{0}")]] [[kernel]] void
binary_g_nd2<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]);
template [[host_name("g3{0}")]] [[kernel]] void
binary_g_nd3<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("gn{0}")]] [[kernel]]
void binary_g<{1}, {2}, {3}>(
device const {1}* a,
device const {1}* b,
device {2}* c,
device {2}* d,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
)";

View File

@@ -0,0 +1,100 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view copy_kernels = R"(
template [[host_name("s_{0}")]] [[kernel]] void copy_s<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
uint index [[thread_position_in_grid]]);
template [[host_name("v_{0}")]] [[kernel]] void copy_v<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
uint index [[thread_position_in_grid]]);
template [[host_name("g4_{0}")]] [[kernel]] void
copy_g_nd<{1}, {2}, 4>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("gg4_{0}")]] [[kernel]] void
copy_gg_nd<{1}, {2}, 4>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint3 index [[thread_position_in_grid]]);
template [[host_name("g5_{0}")]] [[kernel]] void
copy_g_nd<{1}, {2}, 5>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("gg5_{0}")]] [[kernel]] void
copy_gg_nd<{1}, {2}, 5>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint3 index [[thread_position_in_grid]]);
template [[host_name("g1_{0}")]] [[kernel]] void copy_g_nd1<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int64_t& src_stride [[buffer(3)]],
uint index [[thread_position_in_grid]]);
template [[host_name("g2_{0}")]] [[kernel]] void copy_g_nd2<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]);
template [[host_name("g3_{0}")]] [[kernel]] void copy_g_nd3<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("gg1_{0}")]] [[kernel]] void
copy_gg_nd1<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int64_t& src_stride [[buffer(3)]],
constant const int64_t& dst_stride [[buffer(4)]],
uint index [[thread_position_in_grid]]);
template [[host_name("gg2_{0}")]] [[kernel]] void
copy_gg_nd2<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint2 index [[thread_position_in_grid]]);
template [[host_name("gg3_{0}")]] [[kernel]] void
copy_gg_nd3<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint3 index [[thread_position_in_grid]]);
template [[host_name("g_{0}")]] [[kernel]] void copy_g<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int& ndim [[buffer(5)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("gg_{0}")]] [[kernel]] void copy_gg<{1}, {2}>(
device const {1}* src [[buffer(0)]],
device {2}* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
constant const int& ndim [[buffer(5)]],
uint3 index [[thread_position_in_grid]]);
)";

View File

@@ -0,0 +1,34 @@
// Copyright © 2023-2024 Apple Inc.
#pragma once
namespace mlx::core::metal {
const char* utils();
const char* binary_ops();
const char* unary_ops();
const char* ternary_ops();
const char* reduce_utils();
const char* gather();
const char* scatter();
const char* arange();
const char* unary();
const char* binary();
const char* binary_two();
const char* copy();
const char* ternary();
const char* scan();
const char* softmax();
const char* sort();
const char* reduce();
const char* gemm();
const char* steel_gemm_fused();
const char* steel_gemm_masked();
const char* steel_gemm_splitk();
const char* conv();
const char* steel_conv();
const char* steel_conv_general();
} // namespace mlx::core::metal

View File

@@ -0,0 +1,81 @@
// Copyright © 2023-2024 Apple Inc.
constexpr std::string_view gather_kernels = R"(
[[kernel]] void gather{0}_{3}_{6}(
const device {1}* src [[buffer(0)]],
device {1}* out [[buffer(1)]],
const constant int* src_shape [[buffer(2)]],
const constant size_t* src_strides [[buffer(3)]],
const constant size_t& src_ndim [[buffer(4)]],
const constant int* slice_sizes [[buffer(5)]],
const constant int* axes [[buffer(6)]],
const constant int* idx_shapes [[buffer(7)]],
const constant size_t* idx_strides [[buffer(8)]],
const constant int& idx_ndim [[buffer(9)]],
{4}
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {{
Indices<{2}, {3}> idxs{{
{{ {5} }}, idx_shapes, idx_strides, idx_ndim}};
return gather_impl<{1}, {2}, {3}, {6}>(
src,
out,
src_shape,
src_strides,
src_ndim,
slice_sizes,
axes,
idxs,
index,
grid_dim);
}}
)";
constexpr std::string_view scatter_kernels = R"(
[[kernel]] void scatter_1d_index{0}_{4}(
const device {1}* updates [[buffer(1)]],
device mlx_atomic<{1}>* out [[buffer(2)]],
const constant int* out_shape [[buffer(3)]],
const constant size_t* out_strides [[buffer(4)]],
const constant size_t& upd_size [[buffer(5)]],
{5}
uint2 gid [[thread_position_in_grid]]) {{
const array<const device {2}*, {4}> idx_buffers = {{ {6} }};
return scatter_1d_index_impl<{1}, {2}, {3}, {4}>(
updates, out, out_shape, out_strides, upd_size, idx_buffers, gid);
}}
[[kernel]] void scatter{0}_{4}(
const device {1}* updates [[buffer(1)]],
device mlx_atomic<{1}>* out [[buffer(2)]],
const constant int* upd_shape [[buffer(3)]],
const constant size_t* upd_strides [[buffer(4)]],
const constant size_t& upd_ndim [[buffer(5)]],
const constant size_t& upd_size [[buffer(6)]],
const constant int* out_shape [[buffer(7)]],
const constant size_t* out_strides [[buffer(8)]],
const constant size_t& out_ndim [[buffer(9)]],
const constant int* axes [[buffer(10)]],
const constant int* idx_shapes [[buffer(11)]],
const constant size_t* idx_strides [[buffer(12)]],
const constant int& idx_ndim [[buffer(13)]],
{5}
uint2 gid [[thread_position_in_grid]]) {{
Indices<{2}, {4}> idxs{{ {{ {6} }}, idx_shapes, idx_strides, idx_ndim}};
return scatter_impl<{1}, {2}, {3}, {4}>(
updates,
out,
upd_shape,
upd_strides,
upd_ndim,
upd_size,
out_shape,
out_strides,
out_ndim,
axes,
idxs,
gid);
}}
)";

View File

@@ -0,0 +1,168 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view reduce_init_kernels = R"(
[[kernel]] void {0}(
device {1}* out [[buffer(0)]],
uint tid [[thread_position_in_grid]]) {{
out[tid] = {2}<{1}>::init;
}}
)";
constexpr std::string_view reduce_kernels = R"(
template [[host_name("all_{0}")]] [[kernel]] void
all_reduce<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device mlx_atomic<{2}>* out [[buffer(1)]],
const device size_t& in_size [[buffer(2)]],
uint gid [[thread_position_in_grid]],
uint lid [[thread_position_in_threadgroup]],
uint grid_size [[threads_per_grid]],
uint simd_per_group [[simdgroups_per_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
template [[host_name("colGeneral_{0}")]] [[kernel]] void
col_reduce_general<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device mlx_atomic<{2}>* out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& reduction_stride [[buffer(3)]],
const constant size_t& out_size [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
threadgroup {2}* local_data [[threadgroup(0)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 lsize [[threads_per_threadgroup]]);
template [[host_name("colSmall_{0}")]] [[kernel]] void
col_reduce_small<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& reduction_stride [[buffer(3)]],
const constant size_t& out_size [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
const constant size_t& non_col_reductions [[buffer(8)]],
const constant int* non_col_shapes [[buffer(9)]],
const constant size_t* non_col_strides [[buffer(10)]],
const constant int& non_col_ndim [[buffer(11)]],
uint tid [[thread_position_in_grid]]);
template [[host_name("rowGeneralSmall_{0}")]] [[kernel]] void
row_reduce_general_small<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& out_size [[buffer(3)]],
const constant size_t& non_row_reductions [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
uint lid [[thread_position_in_grid]]);
template [[host_name("rowGeneralMed_{0}")]] [[kernel]] void
row_reduce_general_med<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& out_size [[buffer(3)]],
const constant size_t& non_row_reductions [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
uint tid [[threadgroup_position_in_grid]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_per_group [[dispatch_simdgroups_per_threadgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
template [[host_name("rowGeneral_{0}")]] [[kernel]] void
row_reduce_general<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device mlx_atomic<{2}>* out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& out_size [[buffer(3)]],
const constant size_t& non_row_reductions [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 lsize [[threads_per_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_per_group [[simdgroups_per_threadgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
)";
constexpr std::string_view reduce_non_atomic_kernels = R"(
template [[host_name("allNoAtomics_{0}")]] [[kernel]] void
all_reduce_no_atomics<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const device size_t& in_size [[buffer(2)]],
uint gid [[thread_position_in_grid]],
uint lid [[thread_position_in_threadgroup]],
uint grid_size [[threads_per_grid]],
uint simd_per_group [[simdgroups_per_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint thread_group_id [[threadgroup_position_in_grid]]);
template [[host_name("colGeneralNoAtomics_{0}")]] [[kernel]] void
col_reduce_general_no_atomics<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& reduction_stride [[buffer(3)]],
const constant size_t& out_size [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
threadgroup {2}* local_data [[threadgroup(0)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 gid [[thread_position_in_grid]],
uint3 lsize [[threads_per_threadgroup]],
uint3 gsize [[threads_per_grid]]);
template [[host_name("colSmall_{0}")]] [[kernel]] void
col_reduce_small<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& reduction_stride [[buffer(3)]],
const constant size_t& out_size [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
const constant size_t& non_col_reductions [[buffer(8)]],
const constant int* non_col_shapes [[buffer(9)]],
const constant size_t* non_col_strides [[buffer(10)]],
const constant int& non_col_ndim [[buffer(11)]],
uint tid [[thread_position_in_grid]]);
template [[host_name("rowGeneralSmall_{0}")]] [[kernel]] void
row_reduce_general_small<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& out_size [[buffer(3)]],
const constant size_t& non_row_reductions [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
uint lid [[thread_position_in_grid]]);
template [[host_name("rowGeneralNoAtomics_{0}")]] [[kernel]] void
row_reduce_general_no_atomics<{1}, {2}, {3}<{2}>>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& out_size [[buffer(3)]],
const constant size_t& non_row_reductions [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 lsize [[threads_per_threadgroup]],
uint3 gsize [[threads_per_grid]],
uint3 tid [[threadgroup_position_in_grid]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_per_group [[simdgroups_per_threadgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
)";

View File

@@ -0,0 +1,26 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view scan_kernels = R"(
template [[host_name("contig_{0}")]] [[kernel]] void
contiguous_scan<{1}, {2}, {3}<{2}>, 4, {4}, {5}>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant size_t& axis_size [[buffer(2)]],
uint gid [[thread_position_in_grid]],
uint lid [[thread_position_in_threadgroup]],
uint lsize [[threads_per_threadgroup]],
uint simd_size [[threads_per_simdgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
template [[host_name("strided_{0}")]] [[kernel]] void
strided_scan<{1}, {2}, {3}<{2}>, 4, {4}, {5}>(
const device {1}* in [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant size_t& axis_size [[buffer(2)]],
const constant size_t& stride [[buffer(3)]],
uint2 gid [[thread_position_in_grid]],
uint2 lid [[thread_position_in_threadgroup]],
uint2 lsize [[threads_per_threadgroup]],
uint simd_size [[threads_per_simdgroup]]);
)";

View File

@@ -0,0 +1,23 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view softmax_kernels = R"(
template [[host_name("block_{0}")]] [[kernel]] void
softmax_single_row<{1}, {2}>(
const device {1}* in,
device {1}* out,
constant int& axis_size,
uint gid [[thread_position_in_grid]],
uint _lid [[thread_position_in_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
template [[host_name("looped_{0}")]] [[kernel]] void
softmax_looped<{1}, {2}>(
const device {1}* in,
device {1}* out,
constant int& axis_size,
uint gid [[threadgroup_position_in_grid]],
uint lid [[thread_position_in_threadgroup]],
uint lsize [[threads_per_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
)";

View File

@@ -0,0 +1,81 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view block_sort_kernels = R"(
template [[host_name("carg_{0}")]] [[kernel]] void
block_sort<{1}, {2}, true, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& stride_segment_axis [[buffer(4)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
template [[host_name("ncarg_{0}")]] [[kernel]] void
block_sort_nc<{1}, {2}, true, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& nc_dim [[buffer(4)]],
const device int* nc_shape [[buffer(5)]],
const device size_t* nc_strides [[buffer(6)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
template [[host_name("c_{0}")]] [[kernel]] void
block_sort<{1}, {2}, false, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& stride_segment_axis [[buffer(4)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
template [[host_name("nc_{0}")]] [[kernel]] void
block_sort_nc<{1}, {2}, false, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {2}* out [[buffer(1)]],
const constant int& size_sorted_axis [[buffer(2)]],
const constant int& stride_sorted_axis [[buffer(3)]],
const constant int& nc_dim [[buffer(4)]],
const device int* nc_shape [[buffer(5)]],
const device size_t* nc_strides [[buffer(6)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
)";
constexpr std::string_view multiblock_sort_kernels = R"(
template [[host_name("sort_{0}")]] [[kernel]] void
mb_block_sort<{1}, {2}, true, {3}, {4}>(
const device {1}* inp [[buffer(0)]],
device {1}* out_vals [[buffer(1)]],
device {2}* out_idxs [[buffer(2)]],
const constant int& size_sorted_axis [[buffer(3)]],
const constant int& stride_sorted_axis [[buffer(4)]],
const constant int& nc_dim [[buffer(5)]],
const device int* nc_shape [[buffer(6)]],
const device size_t* nc_strides [[buffer(7)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
template [[host_name("partition_{0}")]] [[kernel]] void
mb_block_partition<{1}, {2}, true, {3}, {4}>(
device {2}* block_partitions [[buffer(0)]],
const device {1}* dev_vals [[buffer(1)]],
const device {2}* dev_idxs [[buffer(2)]],
const constant int& size_sorted_axis [[buffer(3)]],
const constant int& merge_tiles [[buffer(4)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 tgp_dims [[threads_per_threadgroup]]);
template [[host_name("merge_{0}")]] [[kernel]] void
mb_block_merge<{1}, {2}, true, {3}, {4}>(
const device {2}* block_partitions [[buffer(0)]],
const device {1}* dev_vals_in [[buffer(1)]],
const device {2}* dev_idxs_in [[buffer(2)]],
device {1}* dev_vals_out [[buffer(3)]],
device {2}* dev_idxs_out [[buffer(4)]],
const constant int& size_sorted_axis [[buffer(5)]],
const constant int& merge_tiles [[buffer(6)]],
const constant int& num_tiles [[buffer(7)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
)";

View File

@@ -0,0 +1,32 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view steel_conv_kernels = R"(
template [[host_name("{name}")]] [[kernel]] void
implicit_gemm_conv_2d<{itype}, {bm}, {bn}, {bk}, {wm}, {wn}, {n_channels}, {small_filter}>(
const device {itype}* A [[buffer(0)]],
const device {itype}* B [[buffer(1)]],
device {itype}* C [[buffer(2)]],
const constant MLXConvParams<2>* params [[buffer(3)]],
const constant ImplicitGemmConv2DParams* gemm_params [[buffer(4)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]);
)";
constexpr std::string_view steel_conv_general_kernels = R"(
template [[host_name("{name}")]] [[kernel]] void
implicit_gemm_conv_2d_general<{itype}, {bm}, {bn}, {bk}, {wm}, {wn}>(
const device {itype}* A [[buffer(0)]],
const device {itype}* B [[buffer(1)]],
device {itype}* C [[buffer(2)]],
const constant MLXConvParams<2>* params [[buffer(3)]],
const constant ImplicitGemmConv2DParams* gemm_params [[buffer(4)]],
const constant Conv2DGeneralJumpParams* jump_params [[buffer(5)]],
const constant Conv2DGeneralBaseInfo* base_h [[buffer(6)]],
const constant Conv2DGeneralBaseInfo* base_w [[buffer(7)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint simd_gid [[simdgroup_index_in_threadgroup]],
uint simd_lid [[thread_index_in_simdgroup]]);
)";

View File

@@ -0,0 +1,106 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view steel_gemm_fused_kernels = R"(
template [[host_name("{name}")]]
[[kernel]] void gemm<{itype}, {bm}, {bn}, {bk}, {wm}, {wn}, {trans_a}, {trans_b}, float>(
const device {itype} *A [[buffer(0)]],
const device {itype} *B [[buffer(1)]],
const device {itype} *C [[buffer(2), function_constant(use_out_source)]],
device {itype} *D [[buffer(3)]],
const constant GEMMParams* params [[buffer(4)]],
const constant GEMMAddMMParams* addmm_params [[buffer(5), function_constant(use_out_source)]],
const constant int* batch_shape [[buffer(6)]],
const constant size_t* batch_strides [[buffer(7)]],
const constant uint32_t* lhs_indices [[buffer(10), function_constant(do_gather)]],
const constant uint32_t* rhs_indices [[buffer(11), function_constant(do_gather)]],
const constant uint32_t* C_indices [[buffer(12), function_constant(gather_bias)]],
const constant int* operand_shape [[buffer(13), function_constant(do_gather)]],
const constant size_t* operand_strides [[buffer(14), function_constant(do_gather)]],
const constant packed_int3& operand_batch_ndim [[buffer(15), function_constant(do_gather)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
)";
constexpr std::string_view steel_gemm_masked_kernels = R"(
template [[host_name("{name}")]] [[kernel]] void
block_masked_gemm<
{itype},
{outmasktype},
{opmasktype},
{bm},
{bn},
{bk},
{wm},
{wn},
{trans_a},
{trans_b},
{mn_aligned},
{k_aligned}>(
const device {itype}* A [[buffer(0)]],
const device {itype}* B [[buffer(1)]],
device {itype}* D [[buffer(3)]],
const constant GEMMParams* params [[buffer(4)]],
const constant int* batch_shape [[buffer(6)]],
const constant size_t* batch_strides [[buffer(7)]],
const device {outmasktype}* out_mask [[buffer(10)]],
const device {opmasktype}* lhs_mask [[buffer(11)]],
const device {opmasktype}* rhs_mask [[buffer(12)]],
const constant int* mask_strides [[buffer(13)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
)";
constexpr std::string_view steel_gemm_splitk_kernels = R"(
template [[host_name("{name}")]] [[kernel]] void
gemm_splitk<
{itype},
{otype},
{bm},
{bn},
{bk},
{wm},
{wn},
{trans_a},
{trans_b},
{mn_aligned},
{k_aligned}>(
const device {itype}* A [[buffer(0)]],
const device {itype}* B [[buffer(1)]],
device {otype}* C [[buffer(2)]],
const constant GEMMSpiltKParams* params [[buffer(3)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]);
)";
constexpr std::string_view steel_gemm_splitk_accum_kernels = R"(
template [[host_name("{name}")]] [[kernel]] void
gemm_splitk_accum<{atype}, {otype}>(
const device {atype}* C_split [[buffer(0)]],
device {otype}* D [[buffer(1)]],
const constant int& k_partitions [[buffer(2)]],
const constant int& partition_stride [[buffer(3)]],
const constant int& ldd [[buffer(4)]],
uint2 gid [[thread_position_in_grid]]);
)";
constexpr std::string_view steel_gemm_splitk_accum_axbpy_kernels = R"(
template [[host_name("{name}")]] [[kernel]] void
gemm_splitk_accum_axpby<{atype}, {otype}>(
const device {atype}* C_split [[buffer(0)]],
device {otype}* D [[buffer(1)]],
const constant int& k_partitions [[buffer(2)]],
const constant int& partition_stride [[buffer(3)]],
const constant int& ldd [[buffer(4)]],
const device {otype}* C [[buffer(5)]],
const constant int& ldc [[buffer(6)]],
const constant int& fdc [[buffer(7)]],
const constant float& alpha [[buffer(8)]],
const constant float& beta [[buffer(9)]],
uint2 gid [[thread_position_in_grid]]);
)";

View File

@@ -0,0 +1,80 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view ternary_kernels = R"(
template [[host_name("v_{0}")]] [[kernel]] void ternary_v<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
uint index [[thread_position_in_grid]]);
template [[host_name("g_{0}")]] [[kernel]] void ternary_g<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const size_t* c_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g1_{0}")]] [[kernel]] void
ternary_g_nd1<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const size_t& a_strides,
constant const size_t& b_strides,
constant const size_t& c_strides,
uint index [[thread_position_in_grid]]);
template [[host_name("g2_{0}")]] [[kernel]] void
ternary_g_nd2<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
constant const size_t c_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]);
template [[host_name("g3_{0}")]] [[kernel]] void
ternary_g_nd3<{1}, {2}>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
constant const size_t c_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g4_{0}")]] [[kernel]] void
ternary_g_nd<{1}, {2}, 4>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const int shape[4],
constant const size_t a_strides[4],
constant const size_t b_strides[4],
constant const size_t c_strides[4],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
template [[host_name("g5_{0}")]] [[kernel]] void
ternary_g_nd<{1}, {2}, 5>(
device const bool* a,
device const {1}* b,
device const {1}* c,
device {1}* d,
constant const int shape[5],
constant const size_t a_strides[5],
constant const size_t b_strides[5],
constant const size_t c_strides[5],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]);
)";

View File

@@ -0,0 +1,16 @@
// Copyright © 2024 Apple Inc.
constexpr std::string_view unary_kernels = R"(
template [[host_name("v{0}")]] [[kernel]] void unary_v<{1}, {2}>(
device const {1}* in,
device {1}* out,
uint index [[thread_position_in_grid]]);
template [[host_name("g{0}")]] [[kernel]] void unary_g<{1}, {2}>(
device const {1}* in,
device {1}* out,
device const int* in_shape,
device const size_t* in_strides,
device const int& ndim,
uint index [[thread_position_in_grid]]);
)";

View File

@@ -0,0 +1,486 @@
// Copyright © 2024 Apple Inc.
#include <fmt/format.h>
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/metal/jit/arange.h"
#include "mlx/backend/metal/jit/binary.h"
#include "mlx/backend/metal/jit/binary_two.h"
#include "mlx/backend/metal/jit/copy.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/jit/reduce.h"
#include "mlx/backend/metal/jit/scan.h"
#include "mlx/backend/metal/jit/softmax.h"
#include "mlx/backend/metal/jit/sort.h"
#include "mlx/backend/metal/jit/steel_conv.h"
#include "mlx/backend/metal/jit/steel_gemm.h"
#include "mlx/backend/metal/jit/ternary.h"
#include "mlx/backend/metal/jit/unary.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
using namespace fmt::literals;
namespace mlx::core {
std::string op_name(const array& arr) {
std::ostringstream op_t;
arr.primitive().print(op_t);
return op_t.str();
}
MTL::ComputePipelineState* get_arange_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out) {
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source
<< metal::utils() << metal::arange()
<< fmt::format(arange_kernels, lib_name, get_type_string(out.dtype()));
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_unary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out) {
std::string lib_name = kernel_name.substr(1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::unary_ops() << metal::unary()
<< fmt::format(
unary_kernels,
lib_name,
get_type_string(out.dtype()),
op_name(out));
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_binary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out) {
std::string lib_name = kernel_name.substr(2);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::binary_ops() << metal::binary()
<< fmt::format(
binary_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()),
op_name(out));
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_binary_two_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out) {
std::string lib_name = kernel_name.substr(2);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::binary_ops()
<< metal::binary_two()
<< fmt::format(
binary_two_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()),
op_name(out));
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_ternary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out) {
std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::ternary_ops() << metal::ternary()
<< fmt::format(
ternary_kernels,
lib_name,
get_type_string(out.dtype()),
op_name(out));
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_copy_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out) {
std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::copy()
<< fmt::format(
copy_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()));
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_softmax_kernel(
metal::Device& d,
const std::string& kernel_name,
bool precise,
const array& out) {
std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::softmax()
<< fmt::format(
softmax_kernels,
lib_name,
get_type_string(out.dtype()),
get_type_string(precise ? float32 : out.dtype()));
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_scan_kernel(
metal::Device& d,
const std::string& kernel_name,
bool reverse,
bool inclusive,
const array& in,
const array& out) {
std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::scan()
<< fmt::format(
scan_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()),
op_name(out),
inclusive,
reverse);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_sort_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out,
int bn,
int tn) {
std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::sort()
<< fmt::format(
block_sort_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()),
bn,
tn);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_mb_sort_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& idx,
int bn,
int tn) {
std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::sort()
<< fmt::format(
multiblock_sort_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(idx.dtype()),
bn,
tn);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_reduce_init_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out) {
auto lib = d.get_library(kernel_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::reduce_utils()
<< fmt::format(
reduce_init_kernels,
kernel_name,
get_type_string(out.dtype()),
op_name(out));
lib = d.get_library(kernel_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_reduce_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out) {
std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
bool non_atomic = out.dtype() == int64 || out.dtype() == uint64;
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::reduce_utils() << metal::reduce()
<< fmt::format(
non_atomic ? reduce_non_atomic_kernels
: reduce_kernels,
lib_name,
get_type_string(in.dtype()),
get_type_string(out.dtype()),
op_name(out));
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_steel_gemm_fused_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& hash_name,
const metal::MTLFCList& func_consts,
const array& out,
bool transpose_a,
bool transpose_b,
int bm,
int bn,
int bk,
int wm,
int wn) {
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::gemm()
<< metal::steel_gemm_fused()
<< fmt::format(
steel_gemm_fused_kernels,
"name"_a = lib_name,
"itype"_a = get_type_string(out.dtype()),
"bm"_a = bm,
"bn"_a = bn,
"bk"_a = bk,
"wm"_a = wm,
"wn"_a = wn,
"trans_a"_a = transpose_a,
"trans_b"_a = transpose_b);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}
MTL::ComputePipelineState* get_steel_gemm_splitk_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out,
bool transpose_a,
bool transpose_b,
int bm,
int bn,
int bk,
int wm,
int wn,
bool mn_aligned,
bool k_aligned) {
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::gemm()
<< metal::steel_gemm_splitk()
<< fmt::format(
steel_gemm_splitk_kernels,
"name"_a = lib_name,
"itype"_a = get_type_string(in.dtype()),
"otype"_a = get_type_string(out.dtype()),
"bm"_a = bm,
"bn"_a = bn,
"bk"_a = bk,
"wm"_a = wm,
"wn"_a = wn,
"trans_a"_a = transpose_a,
"trans_b"_a = transpose_b,
"mn_aligned"_a = mn_aligned,
"k_aligned"_a = k_aligned);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_steel_gemm_splitk_accum_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out,
bool axbpy) {
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::gemm()
<< metal::steel_gemm_splitk()
<< fmt::format(
axbpy ? steel_gemm_splitk_accum_axbpy_kernels
: steel_gemm_splitk_accum_kernels,
"name"_a = lib_name,
"atype"_a = get_type_string(in.dtype()),
"otype"_a = get_type_string(out.dtype()));
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_steel_gemm_masked_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out,
const std::optional<array>& mask_out,
const std::optional<array>& mask_op,
bool transpose_a,
bool transpose_b,
int bm,
int bn,
int bk,
int wm,
int wn,
bool mn_aligned,
bool k_aligned) {
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
auto out_mask_type = mask_out.has_value()
? get_type_string((*mask_out).dtype())
: "nomask_t";
auto op_mask_type =
mask_op.has_value() ? get_type_string((*mask_op).dtype()) : "nomask_t";
kernel_source << metal::utils() << metal::gemm()
<< metal::steel_gemm_masked()
<< fmt::format(
steel_gemm_masked_kernels,
"name"_a = lib_name,
"itype"_a = get_type_string(out.dtype()),
"outmasktype"_a = out_mask_type,
"opmasktype"_a = op_mask_type,
"bm"_a = bm,
"bn"_a = bn,
"bk"_a = bk,
"wm"_a = wm,
"wn"_a = wn,
"trans_a"_a = transpose_a,
"trans_b"_a = transpose_b,
"mn_aligned"_a = mn_aligned,
"k_aligned"_a = k_aligned);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_steel_conv_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out,
int bm,
int bn,
int bk,
int wm,
int wn,
int n_channel_specialization,
bool small_filter) {
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::conv() << metal::steel_conv()
<< fmt::format(
steel_conv_kernels,
"name"_a = lib_name,
"itype"_a = get_type_string(out.dtype()),
"bm"_a = bm,
"bn"_a = bn,
"bk"_a = bk,
"wm"_a = wm,
"wn"_a = wn,
"n_channels"_a = n_channel_specialization,
"small_filter"_a = small_filter);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
MTL::ComputePipelineState* get_steel_conv_general_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out,
int bm,
int bn,
int bk,
int wm,
int wn) {
const auto& lib_name = kernel_name;
auto lib = d.get_library(lib_name);
if (lib == nullptr) {
std::ostringstream kernel_source;
kernel_source << metal::utils() << metal::conv()
<< metal::steel_conv_general()
<< fmt::format(
steel_conv_general_kernels,
"name"_a = lib_name,
"itype"_a = get_type_string(out.dtype()),
"bm"_a = bm,
"bn"_a = bn,
"bk"_a = bk,
"wm"_a = wm,
"wn"_a = wn);
lib = d.get_library(lib_name, kernel_source.str());
}
return d.get_kernel(kernel_name, lib);
}
} // namespace mlx::core
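Every getter above repeats the same memoize-then-compile pattern; a minimal sketch of that pattern (the helper name is hypothetical, assuming get_library returns nullptr on a cache miss, as the checks above imply):

MTL::ComputePipelineState* get_jit_kernel(
    metal::Device& d,
    const std::string& lib_name,
    const std::string& kernel_name,
    const std::string& source) {
  // First lookup consults the device's cached libraries.
  auto lib = d.get_library(lib_name);
  if (lib == nullptr) {
    // Cache miss: compile the generated source once, then reuse it
    // for every later kernel specialization in the same library.
    lib = d.get_library(lib_name, source);
  }
  return d.get_kernel(kernel_name, lib);
}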

mlx/backend/metal/kernels.h Normal file
View File

@@ -0,0 +1,156 @@
// Copyright © 2024 Apple Inc.
#include "mlx/array.h"
#include "mlx/backend/metal/device.h"
namespace mlx::core {
MTL::ComputePipelineState* get_arange_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out);
MTL::ComputePipelineState* get_unary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out);
MTL::ComputePipelineState* get_binary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out);
MTL::ComputePipelineState* get_binary_two_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out);
MTL::ComputePipelineState* get_ternary_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out);
MTL::ComputePipelineState* get_copy_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out);
MTL::ComputePipelineState* get_softmax_kernel(
metal::Device& d,
const std::string& kernel_name,
bool precise,
const array& out);
MTL::ComputePipelineState* get_scan_kernel(
metal::Device& d,
const std::string& kernel_name,
bool reverse,
bool inclusive,
const array& in,
const array& out);
MTL::ComputePipelineState* get_sort_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out,
int bn,
int tn);
MTL::ComputePipelineState* get_mb_sort_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& idx,
int bn,
int tn);
MTL::ComputePipelineState* get_reduce_init_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out);
MTL::ComputePipelineState* get_reduce_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out);
MTL::ComputePipelineState* get_steel_gemm_fused_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& hash_name,
const metal::MTLFCList& func_consts,
const array& out,
bool transpose_a,
bool transpose_b,
int bm,
int bn,
int bk,
int wm,
int wn);
MTL::ComputePipelineState* get_steel_gemm_splitk_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out,
bool transpose_a,
bool transpose_b,
int bm,
int bn,
int bk,
int wm,
int wn,
bool mn_aligned,
bool k_aligned);
MTL::ComputePipelineState* get_steel_gemm_splitk_accum_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& in,
const array& out,
bool axbpy);
MTL::ComputePipelineState* get_steel_gemm_masked_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out,
const std::optional<array>& mask_out,
const std::optional<array>& mask_op,
bool transpose_a,
bool transpose_b,
int bm,
int bn,
int bk,
int wm,
int wn,
bool mn_aligned,
bool k_aligned);
MTL::ComputePipelineState* get_steel_conv_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out,
int bm,
int bn,
int bk,
int wm,
int wn,
int n_channel_specialization,
bool small_filter);
MTL::ComputePipelineState* get_steel_conv_general_kernel(
metal::Device& d,
const std::string& kernel_name,
const array& out,
int bm,
int bn,
int bk,
int wm,
int wn);
} // namespace mlx::core
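A hedged sketch of how a backend op would consume these getters at dispatch time (the kernel_name spelling and the encoder steps are illustrative assumptions, not the exact MLX API):

// Inside some primitive's eval_gpu() (names here are assumptions):
auto& d = metal::device(s.device);  // s: the op's Stream, assumed accessor
auto kernel = get_reduce_kernel(d, kernel_name, in, out);
// Bind `kernel` on a compute command encoder, set the in/out buffers,
// and dispatch a grid sized to `out` as usual.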

View File

@@ -1,26 +1,17 @@
set(
HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/atomic.h
${CMAKE_CURRENT_SOURCE_DIR}/bf16.h
${CMAKE_CURRENT_SOURCE_DIR}/bf16_math.h
${CMAKE_CURRENT_SOURCE_DIR}/binary.h
${CMAKE_CURRENT_SOURCE_DIR}/complex.h
${CMAKE_CURRENT_SOURCE_DIR}/defines.h
${CMAKE_CURRENT_SOURCE_DIR}/erf.h
${CMAKE_CURRENT_SOURCE_DIR}/expm1f.h
${CMAKE_CURRENT_SOURCE_DIR}/indexing.h
${CMAKE_CURRENT_SOURCE_DIR}/unary.h
${CMAKE_CURRENT_SOURCE_DIR}/utils.h
bf16.h
bf16_math.h
complex.h
defines.h
utils.h
steel/conv/params.h
)
set(
KERNELS
"arange"
"arg_reduce"
"binary"
"binary_two"
"conv"
"copy"
"fft"
"gemv"
"quantized"
@@ -28,18 +19,48 @@ set(
"rms_norm"
"layer_norm"
"rope"
"scan"
"scaled_dot_product_attention"
"softmax"
"sort"
"ternary"
"unary"
"gather"
"scatter"
)
if (NOT MLX_METAL_JIT)
set(
KERNELS
${KERNELS}
"arange"
"binary"
"binary_two"
"unary"
"ternary"
"copy"
"softmax"
"sort"
"scan"
"reduce"
)
set(
HEADERS
${HEADERS}
atomic.h
arange.h
unary_ops.h
unary.h
binary_ops.h
binary.h
ternary.h
copy.h
softmax.h
sort.h
scan.h
reduction/ops.h
reduction/reduce_init.h
reduction/reduce_all.h
reduction/reduce_col.h
reduction/reduce_row.h
)
endif()
function(build_kernel_base TARGET SRCFILE DEPS)
set(METAL_FLAGS -Wall -Wextra -fno-fast-math)
set(METAL_FLAGS -Wall -Wextra -fno-fast-math -D${MLX_METAL_VERSION})
if(MLX_METAL_DEBUG)
set(METAL_FLAGS ${METAL_FLAGS}
-gline-tables-only
@@ -68,23 +89,40 @@ foreach(KERNEL ${KERNELS})
set(KERNEL_AIR ${KERNEL}.air ${KERNEL_AIR})
endforeach()
file(GLOB_RECURSE STEEL_KERNELS ${CMAKE_CURRENT_SOURCE_DIR}/steel/*.metal)
file(GLOB_RECURSE STEEL_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/steel/*.h)
foreach(KERNEL ${STEEL_KERNELS})
cmake_path(GET KERNEL STEM TARGET)
build_kernel_base(${TARGET} ${KERNEL} "${STEEL_HEADERS}")
set(KERNEL_AIR ${TARGET}.air ${KERNEL_AIR})
endforeach()
file(GLOB_RECURSE REDUCE_KERNELS ${CMAKE_CURRENT_SOURCE_DIR}/reduction/*.metal)
file(GLOB_RECURSE REDUCE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/reduction/*.h)
foreach(KERNEL ${REDUCE_KERNELS})
cmake_path(GET KERNEL STEM TARGET)
build_kernel_base(${TARGET} ${KERNEL} "${REDUCE_HEADERS}")
set(KERNEL_AIR ${TARGET}.air ${KERNEL_AIR})
endforeach()
if (NOT MLX_METAL_JIT)
set(
STEEL_KERNELS
${CMAKE_CURRENT_SOURCE_DIR}/steel/conv/kernels/steel_conv.metal
${CMAKE_CURRENT_SOURCE_DIR}/steel/conv/kernels/steel_conv_general.metal
${CMAKE_CURRENT_SOURCE_DIR}/steel/gemm/kernels/steel_gemm_fused.metal
${CMAKE_CURRENT_SOURCE_DIR}/steel/gemm/kernels/steel_gemm_masked.metal
${CMAKE_CURRENT_SOURCE_DIR}/steel/gemm/kernels/steel_gemm_splitk.metal
)
set(
STEEL_HEADERS
steel/defines.h
steel/utils.h
steel/conv/conv.h
steel/conv/loader.h
steel/conv/loaders/loader_channel_l.h
steel/conv/loaders/loader_channel_n.h
steel/conv/loaders/loader_general.h
steel/conv/kernels/steel_conv.h
steel/conv/kernels/steel_conv_general.h
steel/gemm/gemm.h
steel/gemm/mma.h
steel/gemm/loader.h
steel/gemm/transforms.h
steel/gemm/kernels/steel_gemm_fused.h
steel/gemm/kernels/steel_gemm_masked.h
steel/gemm/kernels/steel_gemm_splitk.h
)
foreach(KERNEL ${STEEL_KERNELS})
cmake_path(GET KERNEL STEM TARGET)
build_kernel_base(${TARGET} ${KERNEL} "${STEEL_HEADERS}")
set(KERNEL_AIR ${TARGET}.air ${KERNEL_AIR})
endforeach()
endif()
add_custom_command(
OUTPUT ${MLX_METAL_PATH}/mlx.metallib

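Net effect of the CMake change: the kernels and headers inside the NOT MLX_METAL_JIT guards are only pre-compiled into mlx.metallib for ahead-of-time builds. Configuring with the MLX_METAL_JIT option enabled (presumably -DMLX_METAL_JIT=ON) drops them from the metallib, and the getters in kernels.cpp generate and compile their source on first use instead, shrinking the shipped binary.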
View File

@@ -0,0 +1,9 @@
// Copyright © 2023-2024 Apple Inc.
template <typename T>
[[kernel]] void arange(
constant const T& start,
constant const T& step,
device T* out,
uint index [[thread_position_in_grid]]) {
out[index] = start + index * step;
}
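As a worked example, with start = 3 and step = 2, thread i writes 3 + 2 * i, so the grid fills out with 3, 5, 7, and so on.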

View File

@@ -1,15 +1,8 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
// clang-format off
#include "mlx/backend/metal/kernels/bf16.h"
template <typename T>
[[kernel]] void arange(
constant const T& start,
constant const T& step,
device T* out,
uint index [[thread_position_in_grid]]) {
out[index] = start + index * step;
}
#include "mlx/backend/metal/kernels/arange.h"
#define instantiate_arange(tname, type) \
template [[host_name("arange" #tname)]] [[kernel]] void arange<type>( \
@@ -18,15 +11,14 @@ template <typename T>
device type* out, \
uint index [[thread_position_in_grid]]);
// clang-format off
instantiate_arange(uint8, uint8_t)
instantiate_arange(uint16, uint16_t)
instantiate_arange(uint32, uint32_t)
instantiate_arange(uint64, uint64_t)
instantiate_arange(int8, int8_t)
instantiate_arange(int16, int16_t)
instantiate_arange(int32, int32_t)
instantiate_arange(int64, int64_t)
instantiate_arange(float16, half)
instantiate_arange(float32, float)
instantiate_arange(bfloat16, bfloat16_t) // clang-format on

View File

@@ -1,6 +1,5 @@
// Copyright © 2023 Apple Inc.
#include <metal_atomic>
#include <metal_simdgroup>
#include "mlx/backend/metal/kernels/utils.h"
@@ -194,4 +193,4 @@ instantiate_arg_reduce(int32, int32_t)
instantiate_arg_reduce(int64, int64_t)
instantiate_arg_reduce(float16, half)
instantiate_arg_reduce(float32, float)
instantiate_arg_reduce(bfloat16, bfloat16_t) // clang-format on

View File

@@ -4,7 +4,6 @@
#include <metal_atomic>
#include <metal_stdlib>
#include "mlx/backend/metal/kernels/bf16.h"
using namespace metal;

View File

@@ -6,7 +6,7 @@
using namespace metal;
#if defined(__HAVE_BFLOAT__)
#if defined METAL_3_1 || (__METAL_VERSION__ >= 310)
typedef bfloat bfloat16_t;
@@ -312,6 +312,6 @@ METAL_FUNC bool isnan(_MLX_BFloat16 x) {
#pragma METAL internals : disable
#endif // defined(__HAVE_BFLOAT__)
#endif
#include "mlx/backend/metal/kernels/bf16_math.h"
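Keying the native-bfloat guard on METAL_3_1 (supplied through the -D${MLX_METAL_VERSION} compile flag added in the CMake change above) rather than on __HAVE_BFLOAT__ means runtime-JIT-compiled source, which does not go through the same build pass, still selects the same bfloat16_t definition as the ahead-of-time metallib.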

View File

@@ -369,7 +369,7 @@ instantiate_metal_math_funcs(
return static_cast<otype>(__metal_simd_xor(static_cast<ctype>(data))); \
}
#if defined(__HAVE_BFLOAT__)
#if defined METAL_3_1 || (__METAL_VERSION__ >= 310)
#define bfloat16_to_uint16(x) as_type<uint16_t>(x)
#define uint16_to_bfloat16(x) as_type<bfloat16_t>(x)
@@ -391,4 +391,4 @@ instantiate_metal_simd_comm_funcs(
uint16_to_bfloat16);
instantiate_metal_simd_reduction_funcs(bfloat16_t, bfloat16_t, float);
} // namespace metal

View File

@@ -1,266 +1,113 @@
// Copyright © 2023-2024 Apple Inc.
// Copyright © 2024 Apple Inc.
#pragma once
template <typename T, typename U, typename Op>
[[kernel]] void binary_ss(
device const T* a,
device const T* b,
device U* c,
uint index [[thread_position_in_grid]]) {
c[index] = Op()(a[0], b[0]);
}
#include <metal_integer>
#include <metal_math>
template <typename T, typename U, typename Op>
[[kernel]] void binary_sv(
device const T* a,
device const T* b,
device U* c,
uint index [[thread_position_in_grid]]) {
c[index] = Op()(a[0], b[index]);
}
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/utils.h"
template <typename T, typename U, typename Op>
[[kernel]] void binary_vs(
device const T* a,
device const T* b,
device U* c,
uint index [[thread_position_in_grid]]) {
c[index] = Op()(a[index], b[0]);
}
struct Add {
template <typename T>
T operator()(T x, T y) {
return x + y;
}
};
template <typename T, typename U, typename Op>
[[kernel]] void binary_vv(
device const T* a,
device const T* b,
device U* c,
uint index [[thread_position_in_grid]]) {
c[index] = Op()(a[index], b[index]);
}
struct Divide {
template <typename T>
T operator()(T x, T y) {
return x / y;
}
};
template <typename T, typename U, typename Op>
[[kernel]] void binary_g_nd1(
device const T* a,
device const T* b,
device U* c,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]) {
auto a_idx = elem_to_loc_1(index, a_stride);
auto b_idx = elem_to_loc_1(index, b_stride);
c[index] = Op()(a[a_idx], b[b_idx]);
}
struct Remainder {
template <typename T>
metal::enable_if_t<metal::is_integral_v<T> & !metal::is_signed_v<T>, T>
operator()(T x, T y) {
return x % y;
}
template <typename T>
metal::enable_if_t<metal::is_integral_v<T> & metal::is_signed_v<T>, T>
operator()(T x, T y) {
auto r = x % y;
if (r != 0 && (r < 0 != y < 0)) {
r += y;
}
return r;
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
T r = fmod(x, y);
if (r != 0 && (r < 0 != y < 0)) {
r += y;
}
return r;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
return x % y;
}
};
template <typename T, typename U, typename Op>
[[kernel]] void binary_g_nd2(
device const T* a,
device const T* b,
device U* c,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_2(index, a_strides);
auto b_idx = elem_to_loc_2(index, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * index.y;
c[out_idx] = Op()(a[a_idx], b[b_idx]);
}
struct Equal {
template <typename T>
bool operator()(T x, T y) {
return x == y;
}
};
template <typename T, typename U, typename Op>
[[kernel]] void binary_g_nd3(
device const T* a,
device const T* b,
device U* c,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_3(index, a_strides);
auto b_idx = elem_to_loc_3(index, b_strides);
size_t out_idx =
index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op()(a[a_idx], b[b_idx]);
}
struct NaNEqual {
template <typename T>
bool operator()(T x, T y) {
return x == y || (metal::isnan(x) && metal::isnan(y));
}
template <>
bool operator()(complex64_t x, complex64_t y) {
return x == y ||
(metal::isnan(x.real) && metal::isnan(y.real) && metal::isnan(x.imag) &&
metal::isnan(y.imag)) ||
(x.real == y.real && metal::isnan(x.imag) && metal::isnan(y.imag)) ||
(metal::isnan(x.real) && metal::isnan(y.real) && x.imag == y.imag);
}
};
template <typename T, typename U, typename Op, int DIM>
[[kernel]] void binary_g_nd(
device const T* a,
device const T* b,
device U* c,
constant const int shape[DIM],
constant const size_t a_strides[DIM],
constant const size_t b_strides[DIM],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd<DIM>(index, shape, a_strides, b_strides);
size_t out_idx =
index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op()(a[idx.x], b[idx.y]);
}
struct Greater {
template <typename T>
bool operator()(T x, T y) {
return x > y;
}
};
struct GreaterEqual {
template <typename T>
bool operator()(T x, T y) {
return x >= y;
}
};
struct Less {
template <typename T>
bool operator()(T x, T y) {
return x < y;
}
};
struct LessEqual {
template <typename T>
bool operator()(T x, T y) {
return x <= y;
}
};
struct LogAddExp {
template <typename T>
T operator()(T x, T y) {
if (metal::isnan(x) || metal::isnan(y)) {
return metal::numeric_limits<T>::quiet_NaN();
}
constexpr T inf = metal::numeric_limits<T>::infinity();
T maxval = metal::max(x, y);
T minval = metal::min(x, y);
return (minval == -inf || maxval == inf)
? maxval
: (maxval + log1p(metal::exp(minval - maxval)));
};
};
struct Maximum {
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
return metal::max(x, y);
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
if (metal::isnan(x)) {
return x;
}
return x > y ? x : y;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
if (metal::isnan(x.real) || metal::isnan(x.imag)) {
return x;
}
return x > y ? x : y;
}
};
struct Minimum {
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
return metal::min(x, y);
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
if (metal::isnan(x)) {
return x;
}
return x < y ? x : y;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
if (metal::isnan(x.real) || metal::isnan(x.imag)) {
return x;
}
return x < y ? x : y;
}
};
struct Multiply {
template <typename T>
T operator()(T x, T y) {
return x * y;
}
};
struct NotEqual {
template <typename T>
bool operator()(T x, T y) {
return x != y;
}
template <>
bool operator()(complex64_t x, complex64_t y) {
return x.real != y.real || x.imag != y.imag;
}
};
struct Power {
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T base, T exp) {
return metal::pow(base, exp);
}
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T base, T exp) {
T res = 1;
while (exp) {
if (exp & 1) {
res *= base;
}
exp >>= 1;
base *= base;
}
return res;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
auto x_theta = metal::atan(x.imag / x.real);
auto x_ln_r = 0.5 * metal::log(x.real * x.real + x.imag * x.imag);
auto mag = metal::exp(y.real * x_ln_r - y.imag * x_theta);
auto phase = y.imag * x_ln_r + y.real * x_theta;
return {mag * metal::cos(phase), mag * metal::sin(phase)};
}
};
struct Subtract {
template <typename T>
T operator()(T x, T y) {
return x - y;
}
};
struct LogicalAnd {
template <typename T>
T operator()(T x, T y) {
return x && y;
};
};
struct LogicalOr {
template <typename T>
T operator()(T x, T y) {
return x || y;
};
};
struct BitwiseAnd {
template <typename T>
T operator()(T x, T y) {
return x & y;
};
};
struct BitwiseOr {
template <typename T>
T operator()(T x, T y) {
return x | y;
};
};
struct BitwiseXor {
template <typename T>
T operator()(T x, T y) {
return x ^ y;
};
};
struct LeftShift {
template <typename T>
T operator()(T x, T y) {
return x << y;
};
};
struct RightShift {
template <typename T>
T operator()(T x, T y) {
return x >> y;
};
};
template <typename T, typename U, typename Op>
[[kernel]] void binary_g(
device const T* a,
device const T* b,
device U* c,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd(index, shape, a_strides, b_strides, ndim);
size_t out_idx = index.x + grid_dim.x * (index.y + grid_dim.y * index.z);
c[out_idx] = Op()(a[idx.x], b[idx.y]);
}
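The shape of this refactor: the operator structs move out to binary_ops.h, the kernel templates stay here under #pragma once, and the kernels drop the op_ infix (binary_op_ss becomes binary_ss, and so on), so the same header can be #included by the ahead-of-time binary.metal below and stitched into JIT-generated source by get_binary_kernel in kernels.cpp.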

View File

@@ -1,130 +1,24 @@
// Copyright © 2023-2024 Apple Inc.
// Copyright © 2024 Apple Inc.
#include <metal_integer>
#include <metal_math>
// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/binary_ops.h"
#include "mlx/backend/metal/kernels/binary.h"
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_ss(
device const T* a,
device const T* b,
device U* c,
uint index [[thread_position_in_grid]]) {
c[index] = Op()(a[0], b[0]);
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_sv(
device const T* a,
device const T* b,
device U* c,
uint index [[thread_position_in_grid]]) {
c[index] = Op()(a[0], b[index]);
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_vs(
device const T* a,
device const T* b,
device U* c,
uint index [[thread_position_in_grid]]) {
c[index] = Op()(a[index], b[0]);
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_vv(
device const T* a,
device const T* b,
device U* c,
uint index [[thread_position_in_grid]]) {
c[index] = Op()(a[index], b[index]);
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_g_nd1(
device const T* a,
device const T* b,
device U* c,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]) {
auto a_idx = elem_to_loc_1(index, a_stride);
auto b_idx = elem_to_loc_1(index, b_stride);
c[index] = Op()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_g_nd2(
device const T* a,
device const T* b,
device U* c,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_2(index, a_strides);
auto b_idx = elem_to_loc_2(index, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * index.y;
c[out_idx] = Op()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_g_nd3(
device const T* a,
device const T* b,
device U* c,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_3(index, a_strides);
auto b_idx = elem_to_loc_3(index, b_strides);
size_t out_idx =
index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op, int DIM>
[[kernel]] void binary_op_g_nd(
device const T* a,
device const T* b,
device U* c,
constant const int shape[DIM],
constant const size_t a_strides[DIM],
constant const size_t b_strides[DIM],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd<DIM>(index, shape, a_strides, b_strides);
size_t out_idx =
index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op()(a[idx.x], b[idx.y]);
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_g(
device const T* a,
device const T* b,
device U* c,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd(index, shape, a_strides, b_strides, ndim);
size_t out_idx = index.x + grid_dim.x * (index.y + grid_dim.y * index.z);
c[out_idx] = Op()(a[idx.x], b[idx.y]);
}
#define instantiate_binary(name, itype, otype, op, bopt) \
template \
[[host_name(name)]] [[kernel]] void binary_op_##bopt<itype, otype, op>( \
[[host_name(name)]] [[kernel]] void binary_##bopt<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
uint index [[thread_position_in_grid]]);
#define instantiate_binary_g_dim(name, itype, otype, op, dims) \
template [[host_name(name "_" #dims)]] [[kernel]] void \
binary_op_g_nd<itype, otype, op, dims>( \
template [[host_name("g" #dims name)]] [[kernel]] void \
binary_g_nd<itype, otype, op, dims>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
@@ -135,16 +29,16 @@ template <typename T, typename U, typename Op>
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_binary_g_nd(name, itype, otype, op) \
template [[host_name(name "_1")]] [[kernel]] void \
binary_op_g_nd1<itype, otype, op>( \
template [[host_name("g1" name)]] [[kernel]] void \
binary_g_nd1<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
constant const size_t& a_stride, \
constant const size_t& b_stride, \
uint index [[thread_position_in_grid]]); \
template [[host_name(name "_2")]] [[kernel]] void \
binary_op_g_nd2<itype, otype, op>( \
template [[host_name("g2" name)]] [[kernel]] void \
binary_g_nd2<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
@@ -152,8 +46,8 @@ template <typename T, typename U, typename Op>
constant const size_t b_strides[2], \
uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]); \
template [[host_name(name "_3")]] [[kernel]] void \
binary_op_g_nd3<itype, otype, op>( \
template [[host_name("g3" name)]] [[kernel]] void \
binary_g_nd3<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
@@ -162,30 +56,28 @@ template <typename T, typename U, typename Op>
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
instantiate_binary_g_dim(name, itype, otype, op, 4) \
instantiate_binary_g_dim(name, itype, otype, op, 5)
#define instantiate_binary_g(name, itype, otype, op) \
template [[host_name(name)]] [[kernel]] void binary_op_g<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
constant const int* shape, \
constant const size_t* a_strides, \
constant const size_t* b_strides, \
constant const int& ndim, \
uint3 index [[thread_position_in_grid]], \
#define instantiate_binary_g(name, itype, otype, op) \
template [[host_name("gn" name)]] [[kernel]] void binary_g<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
constant const int* shape, \
constant const size_t* a_strides, \
constant const size_t* b_strides, \
constant const int& ndim, \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
// clang-format off
#define instantiate_binary_all(name, tname, itype, otype, op) \
instantiate_binary("ss" #name #tname, itype, otype, op, ss) \
instantiate_binary("sv" #name #tname, itype, otype, op, sv) \
instantiate_binary("vs" #name #tname, itype, otype, op, vs) \
instantiate_binary("vv" #name #tname, itype, otype, op, vv) \
instantiate_binary_g("g" #name #tname, itype, otype, op) \
instantiate_binary_g_nd("g" #name #tname, itype, otype, op) // clang-format on
instantiate_binary_g(#name #tname, itype, otype, op) \
instantiate_binary_g_nd(#name #tname, itype, otype, op)
// clang-format off
#define instantiate_binary_integer(name, op) \
instantiate_binary_all(name, uint8, uint8_t, uint8_t, op) \
instantiate_binary_all(name, uint16, uint16_t, uint16_t, op) \
@@ -194,22 +86,19 @@ template <typename T, typename U, typename Op>
instantiate_binary_all(name, int8, int8_t, int8_t, op) \
instantiate_binary_all(name, int16, int16_t, int16_t, op) \
instantiate_binary_all(name, int32, int32_t, int32_t, op) \
instantiate_binary_all(name, int64, int64_t, int64_t, op) // clang-format on
instantiate_binary_all(name, int64, int64_t, int64_t, op)
// clang-format off
#define instantiate_binary_float(name, op) \
instantiate_binary_all(name, float16, half, half, op) \
instantiate_binary_all(name, float32, float, float, op) \
instantiate_binary_all(name, bfloat16, bfloat16_t, bfloat16_t, op) // clang-format on
instantiate_binary_all(name, bfloat16, bfloat16_t, bfloat16_t, op)
// clang-format off
#define instantiate_binary_types(name, op) \
instantiate_binary_all(name, bool_, bool, bool, op) \
instantiate_binary_integer(name, op) \
instantiate_binary_all(name, complex64, complex64_t, complex64_t, op) \
instantiate_binary_float(name, op) // clang-format on
instantiate_binary_float(name, op)
// clang-format off
#define instantiate_binary_types_bool(name, op) \
instantiate_binary_all(name, bool_, bool, bool, op) \
instantiate_binary_all(name, uint8, uint8_t, bool, op) \
@@ -223,9 +112,8 @@ template <typename T, typename U, typename Op>
instantiate_binary_all(name, float16, half, bool, op) \
instantiate_binary_all(name, float32, float, bool, op) \
instantiate_binary_all(name, bfloat16, bfloat16_t, bool, op) \
instantiate_binary_all(name, complex64, complex64_t, bool, op) // clang-format on
instantiate_binary_all(name, complex64, complex64_t, bool, op)
// clang-format off
instantiate_binary_types(add, Add)
instantiate_binary_types(div, Divide)
instantiate_binary_types_bool(eq, Equal)
@@ -241,6 +129,7 @@ instantiate_binary_types(mul, Multiply)
instantiate_binary_types(sub, Subtract)
instantiate_binary_types(pow, Power)
instantiate_binary_types(rem, Remainder)
instantiate_binary_float(arctan2, ArcTan2)
// NaNEqual only needed for floating point types with boolean output
instantiate_binary_all(naneq, float16, half, bool, NaNEqual)

View File

@@ -0,0 +1,296 @@
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <metal_integer>
#include <metal_math>
struct Add {
template <typename T>
T operator()(T x, T y) {
return x + y;
}
};
struct FloorDivide {
template <typename T>
T operator()(T x, T y) {
return x / y;
}
template <>
float operator()(float x, float y) {
return trunc(x / y);
}
template <>
half operator()(half x, half y) {
return trunc(x / y);
}
template <>
bfloat16_t operator()(bfloat16_t x, bfloat16_t y) {
return trunc(x / y);
}
};
struct Divide {
template <typename T>
T operator()(T x, T y) {
return x / y;
}
};
struct Remainder {
template <typename T>
metal::enable_if_t<metal::is_integral_v<T> & !metal::is_signed_v<T>, T>
operator()(T x, T y) {
return x % y;
}
template <typename T>
metal::enable_if_t<metal::is_integral_v<T> & metal::is_signed_v<T>, T>
operator()(T x, T y) {
auto r = x % y;
if (r != 0 && (r < 0 != y < 0)) {
r += y;
}
return r;
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
T r = fmod(x, y);
if (r != 0 && (r < 0 != y < 0)) {
r += y;
}
return r;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
return x % y;
}
};
struct Equal {
template <typename T>
bool operator()(T x, T y) {
return x == y;
}
};
struct NaNEqual {
template <typename T>
bool operator()(T x, T y) {
return x == y || (metal::isnan(x) && metal::isnan(y));
}
template <>
bool operator()(complex64_t x, complex64_t y) {
return x == y ||
(metal::isnan(x.real) && metal::isnan(y.real) && metal::isnan(x.imag) &&
metal::isnan(y.imag)) ||
(x.real == y.real && metal::isnan(x.imag) && metal::isnan(y.imag)) ||
(metal::isnan(x.real) && metal::isnan(y.real) && x.imag == y.imag);
}
};
struct Greater {
template <typename T>
bool operator()(T x, T y) {
return x > y;
}
};
struct GreaterEqual {
template <typename T>
bool operator()(T x, T y) {
return x >= y;
}
};
struct Less {
template <typename T>
bool operator()(T x, T y) {
return x < y;
}
};
struct LessEqual {
template <typename T>
bool operator()(T x, T y) {
return x <= y;
}
};
struct LogAddExp {
template <typename T>
T operator()(T x, T y) {
if (metal::isnan(x) || metal::isnan(y)) {
return metal::numeric_limits<T>::quiet_NaN();
}
constexpr T inf = metal::numeric_limits<T>::infinity();
T maxval = metal::max(x, y);
T minval = metal::min(x, y);
return (minval == -inf || maxval == inf)
? maxval
: (maxval + log1p(metal::exp(minval - maxval)));
};
};
struct Maximum {
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
return metal::max(x, y);
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
if (metal::isnan(x)) {
return x;
}
return x > y ? x : y;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
if (metal::isnan(x.real) || metal::isnan(x.imag)) {
return x;
}
return x > y ? x : y;
}
};
struct Minimum {
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
return metal::min(x, y);
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
if (metal::isnan(x)) {
return x;
}
return x < y ? x : y;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
if (metal::isnan(x.real) || metal::isnan(x.imag)) {
return x;
}
return x < y ? x : y;
}
};
struct Multiply {
template <typename T>
T operator()(T x, T y) {
return x * y;
}
};
struct NotEqual {
template <typename T>
bool operator()(T x, T y) {
return x != y;
}
template <>
bool operator()(complex64_t x, complex64_t y) {
return x.real != y.real || x.imag != y.imag;
}
};
struct Power {
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T base, T exp) {
return metal::pow(base, exp);
}
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T base, T exp) {
T res = 1;
while (exp) {
if (exp & 1) {
res *= base;
}
exp >>= 1;
base *= base;
}
return res;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
auto x_theta = metal::atan(x.imag / x.real);
auto x_ln_r = 0.5 * metal::log(x.real * x.real + x.imag * x.imag);
auto mag = metal::exp(y.real * x_ln_r - y.imag * x_theta);
auto phase = y.imag * x_ln_r + y.real * x_theta;
return {mag * metal::cos(phase), mag * metal::sin(phase)};
}
};
struct Subtract {
template <typename T>
T operator()(T x, T y) {
return x - y;
}
};
struct LogicalAnd {
template <typename T>
T operator()(T x, T y) {
return x && y;
};
};
struct LogicalOr {
template <typename T>
T operator()(T x, T y) {
return x || y;
};
};
struct BitwiseAnd {
template <typename T>
T operator()(T x, T y) {
return x & y;
};
};
struct BitwiseOr {
template <typename T>
T operator()(T x, T y) {
return x | y;
};
};
struct BitwiseXor {
template <typename T>
T operator()(T x, T y) {
return x ^ y;
};
};
struct LeftShift {
template <typename T>
T operator()(T x, T y) {
return x << y;
};
};
struct RightShift {
template <typename T>
T operator()(T x, T y) {
return x >> y;
};
};
struct ArcTan2 {
template <typename T>
T operator()(T y, T x) {
return metal::precise::atan2(y, x);
}
};
struct DivMod {
template <typename T>
metal::array<T, 2> operator()(T x, T y) {
return {FloorDivide{}(x, y), Remainder{}(x, y)};
};
};
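As a worked example, DivMod{}(7, 3) returns {2, 1}: FloorDivide supplies the quotient (plain integer division for integral types, trunc(x / y) in the floating-point specializations) and Remainder supplies the remainder. Packing both into a metal::array<T, 2> lets one kernel instantiation produce both divmod outputs.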

View File

@@ -0,0 +1,140 @@
// Copyright © 2024 Apple Inc.
template <typename T, typename U, typename Op>
[[kernel]] void binary_ss(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
auto out = Op()(a[0], b[0]);
c[index] = out[0];
d[index] = out[1];
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_sv(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
auto out = Op()(a[0], b[index]);
c[index] = out[0];
d[index] = out[1];
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_vs(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
auto out = Op()(a[index], b[0]);
c[index] = out[0];
d[index] = out[1];
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_vv(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
auto out = Op()(a[index], b[index]);
c[index] = out[0];
d[index] = out[1];
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_g_nd1(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]) {
auto a_idx = elem_to_loc_1(index, a_stride);
auto b_idx = elem_to_loc_1(index, b_stride);
auto out = Op()(a[a_idx], b[b_idx]);
c[index] = out[0];
d[index] = out[1];
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_g_nd2(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_2(index, a_strides);
auto b_idx = elem_to_loc_2(index, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * index.y;
auto out = Op()(a[a_idx], b[b_idx]);
c[out_idx] = out[0];
d[out_idx] = out[1];
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_g_nd3(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_3(index, a_strides);
auto b_idx = elem_to_loc_3(index, b_strides);
size_t out_idx =
index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
auto out = Op()(a[a_idx], b[b_idx]);
c[out_idx] = out[0];
d[out_idx] = out[1];
}
template <typename T, typename U, typename Op, int DIM>
[[kernel]] void binary_g_nd(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const int shape[DIM],
constant const size_t a_strides[DIM],
constant const size_t b_strides[DIM],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd<DIM>(index, shape, a_strides, b_strides);
size_t out_idx =
index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
auto out = Op()(a[idx.x], b[idx.y]);
c[out_idx] = out[0];
d[out_idx] = out[1];
}
template <typename T, typename U, typename Op>
[[kernel]] void binary_g(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd(index, shape, a_strides, b_strides, ndim);
size_t out_idx = index.x + grid_dim.x * (index.y + grid_dim.y * index.z);
auto out = Op()(a[idx.x], b[idx.y]);
c[out_idx] = out[0];
d[out_idx] = out[1];
}
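Unlike the Op1/Op2 kernels they replace in binary_two.metal below, each of these kernels evaluates a single fused Op that returns a two-element array and scatters out[0] and out[1] into c and d, so paired results such as divmod cost one pass over the inputs instead of two.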

View File

@@ -1,212 +1,24 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2024 Apple Inc.
#include <metal_integer>
#include <metal_math>
#include "mlx/backend/metal/kernels/bf16.h"
// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/binary_ops.h"
#include "mlx/backend/metal/kernels/binary_two.h"
struct FloorDivide {
template <typename T>
T operator()(T x, T y) {
return x / y;
}
template <>
float operator()(float x, float y) {
return trunc(x / y);
}
template <>
half operator()(half x, half y) {
return trunc(x / y);
}
template <>
bfloat16_t operator()(bfloat16_t x, bfloat16_t y) {
return trunc(x / y);
}
};
struct Remainder {
template <typename T>
metal::enable_if_t<metal::is_integral_v<T> & !metal::is_signed_v<T>, T>
operator()(T x, T y) {
return x % y;
}
template <typename T>
metal::enable_if_t<metal::is_integral_v<T> & metal::is_signed_v<T>, T>
operator()(T x, T y) {
auto r = x % y;
if (r != 0 && (r < 0 != y < 0)) {
r += y;
}
return r;
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
T r = fmod(x, y);
if (r != 0 && (r < 0 != y < 0)) {
r += y;
}
return r;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
return x % y;
}
};
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_s2s(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[0], b[0]);
d[index] = Op2()(a[0], b[0]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_ss(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[0], b[0]);
d[index] = Op2()(a[0], b[0]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_sv(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[0], b[index]);
d[index] = Op2()(a[0], b[index]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_vs(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[index], b[0]);
d[index] = Op2()(a[index], b[0]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_vv(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[index], b[index]);
d[index] = Op2()(a[index], b[index]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g_nd1(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]) {
auto a_idx = elem_to_loc_1(index, a_stride);
auto b_idx = elem_to_loc_1(index, b_stride);
c[index] = Op1()(a[a_idx], b[b_idx]);
d[index] = Op2()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g_nd2(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_2(index, a_strides);
auto b_idx = elem_to_loc_2(index, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * index.y;
c[out_idx] = Op1()(a[a_idx], b[b_idx]);
d[out_idx] = Op2()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g_nd3(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_3(index, a_strides);
auto b_idx = elem_to_loc_3(index, b_strides);
size_t out_idx =
index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op1()(a[a_idx], b[b_idx]);
d[out_idx] = Op2()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op1, typename Op2, int DIM>
[[kernel]] void binary_op_g_nd(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const int shape[DIM],
constant const size_t a_strides[DIM],
constant const size_t b_strides[DIM],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd<DIM>(index, shape, a_strides, b_strides);
size_t out_idx =
index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op1()(a[idx.x], b[idx.y]);
d[out_idx] = Op2()(a[idx.x], b[idx.y]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd(index, shape, a_strides, b_strides, ndim);
size_t out_idx = index.x + grid_dim.x * (index.y + grid_dim.y * index.z);
c[out_idx] = Op1()(a[idx.x], b[idx.y]);
d[out_idx] = Op2()(a[idx.x], b[idx.y]);
}
#define instantiate_binary(name, itype, otype, op1, op2, bopt) \
#define instantiate_binary(name, itype, otype, op, bopt) \
template [[host_name(name)]] [[kernel]] void \
binary_op_##bopt<itype, otype, op1, op2>( \
binary_##bopt<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
uint index [[thread_position_in_grid]]);
#define instantiate_binary_g_dim(name, itype, otype, op1, op2, dims) \
template [[host_name(name "_" #dims)]] [[kernel]] void \
binary_op_g_nd<itype, otype, op1, op2, dims>( \
#define instantiate_binary_g_dim(name, itype, otype, op, dims) \
template [[host_name("g" #dims name)]] [[kernel]] void \
binary_g_nd<itype, otype, op, dims>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
@@ -217,10 +29,9 @@ template <typename T, typename U, typename Op1, typename Op2>
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
// clang-format off
#define instantiate_binary_g_nd(name, itype, otype, op1, op2) \
template [[host_name(name "_1")]] [[kernel]] void \
binary_op_g_nd1<itype, otype, op1, op2>( \
#define instantiate_binary_g_nd(name, itype, otype, op) \
template [[host_name("g1" name)]] [[kernel]] void \
binary_g_nd1<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
@@ -228,8 +39,8 @@ template <typename T, typename U, typename Op1, typename Op2>
constant const size_t& a_stride, \
constant const size_t& b_stride, \
uint index [[thread_position_in_grid]]); \
template [[host_name(name "_2")]] [[kernel]] void \
binary_op_g_nd2<itype, otype, op1, op2>( \
template [[host_name("g2" name)]] [[kernel]] void \
binary_g_nd2<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
@@ -238,8 +49,8 @@ template <typename T, typename U, typename Op1, typename Op2>
constant const size_t b_strides[2], \
uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]); \
template [[host_name(name "_3")]] [[kernel]] void \
binary_op_g_nd3<itype, otype, op1, op2>( \
template [[host_name("g3" name)]] [[kernel]] void \
binary_g_nd3<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
@@ -248,12 +59,12 @@ template <typename T, typename U, typename Op1, typename Op2>
constant const size_t b_strides[3], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
instantiate_binary_g_dim(name, itype, otype, op1, op2, 4) \
instantiate_binary_g_dim(name, itype, otype, op1, op2, 5) // clang-format on
instantiate_binary_g_dim(name, itype, otype, op, 4) \
instantiate_binary_g_dim(name, itype, otype, op, 5)
#define instantiate_binary_g(name, itype, otype, op1, op2) \
template [[host_name(name)]] [[kernel]] void \
binary_op_g<itype, otype, op2, op2>( \
#define instantiate_binary_g(name, itype, otype, op) \
template [[host_name("gn" name)]] [[kernel]] void \
binary_g<itype, otype, op>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
@@ -265,33 +76,30 @@ template <typename T, typename U, typename Op1, typename Op2>
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
// clang-format off
#define instantiate_binary_all(name, tname, itype, otype, op1, op2) \
instantiate_binary("ss" #name #tname, itype, otype, op1, op2, ss) \
instantiate_binary("sv" #name #tname, itype, otype, op1, op2, sv) \
instantiate_binary("vs" #name #tname, itype, otype, op1, op2, vs) \
instantiate_binary("vv" #name #tname, itype, otype, op1, op2, vv) \
instantiate_binary_g("g" #name #tname, itype, otype, op1, op2) \
instantiate_binary_g_nd("g" #name #tname, itype, otype, op1, op2) // clang-format on
#define instantiate_binary_all(name, tname, itype, otype, op) \
instantiate_binary("ss" #name #tname, itype, otype, op, ss) \
instantiate_binary("sv" #name #tname, itype, otype, op, sv) \
instantiate_binary("vs" #name #tname, itype, otype, op, vs) \
instantiate_binary("vv" #name #tname, itype, otype, op, vv) \
instantiate_binary_g(#name #tname, itype, otype, op) \
instantiate_binary_g_nd(#name #tname, itype, otype, op)
// clang-format off
#define instantiate_binary_float(name, op1, op2) \
instantiate_binary_all(name, float16, half, half, op1, op2) \
instantiate_binary_all(name, float32, float, float, op1, op2) \
instantiate_binary_all(name, bfloat16, bfloat16_t, bfloat16_t, op1, op2) // clang-format on
#define instantiate_binary_float(name, op) \
instantiate_binary_all(name, float16, half, half, op) \
instantiate_binary_all(name, float32, float, float, op) \
instantiate_binary_all(name, bfloat16, bfloat16_t, bfloat16_t, op)
// clang-format off
#define instantiate_binary_types(name, op1, op2) \
instantiate_binary_all(name, bool_, bool, bool, op1, op2) \
instantiate_binary_all(name, uint8, uint8_t, uint8_t, op1, op2) \
instantiate_binary_all(name, uint16, uint16_t, uint16_t, op1, op2) \
instantiate_binary_all(name, uint32, uint32_t, uint32_t, op1, op2) \
instantiate_binary_all(name, uint64, uint64_t, uint64_t, op1, op2) \
instantiate_binary_all(name, int8, int8_t, int8_t, op1, op2) \
instantiate_binary_all(name, int16, int16_t, int16_t, op1, op2) \
instantiate_binary_all(name, int32, int32_t, int32_t, op1, op2) \
instantiate_binary_all(name, int64, int64_t, int64_t, op1, op2) \
instantiate_binary_all(name, complex64, complex64_t, complex64_t, op1, op2) \
instantiate_binary_float(name, op1, op2)
#define instantiate_binary_types(name, op) \
instantiate_binary_all(name, bool_, bool, bool, op) \
instantiate_binary_all(name, uint8, uint8_t, uint8_t, op) \
instantiate_binary_all(name, uint16, uint16_t, uint16_t, op) \
instantiate_binary_all(name, uint32, uint32_t, uint32_t, op) \
instantiate_binary_all(name, uint64, uint64_t, uint64_t, op) \
instantiate_binary_all(name, int8, int8_t, int8_t, op) \
instantiate_binary_all(name, int16, int16_t, int16_t, op) \
instantiate_binary_all(name, int32, int32_t, int32_t, op) \
instantiate_binary_all(name, int64, int64_t, int64_t, op) \
instantiate_binary_all(name, complex64, complex64_t, complex64_t, op) \
instantiate_binary_float(name, op)
instantiate_binary_types(divmod, FloorDivide, Remainder) // clang-format on
instantiate_binary_types(divmod, DivMod) // clang-format on
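The rewritten instantiate_binary_g also removes a latent bug visible in the deleted macro above, which expanded to binary_op_g<itype, otype, op2, op2> and so passed op2 for both operators; with a single fused op there is no pair left to mix up.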

View File

@@ -1,7 +0,0 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/backend/metal/kernels/binary.h"
#include "mlx/backend/metal/kernels/ternary.h"
#include "mlx/backend/metal/kernels/unary.h"
typedef half float16_t;

View File

@@ -109,6 +109,7 @@ template <typename T, int N>
bool valid = n < params->N;
// Unroll dimensions
int kernel_stride = 1;
for (int i = N - 1; i >= 0; --i) {
int os_ = (oS % params->oS[i]);
int ws_ = (wS % params->wS[i]);
@@ -125,7 +126,8 @@ template <typename T, int N>
oS /= params->oS[i];
wS /= params->wS[i];
out += ws_ * params->str[i];
out += ws_ * kernel_stride;
kernel_stride *= params->wS[i];
}
if (valid) {
@@ -648,4 +650,4 @@ winograd_conv_2d_output_transform(
// clang-format off
instantiate_winograd_conv_2d(float32, float);
instantiate_winograd_conv_2d(float16, half); // clang-format on

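The indexing fix in the unroll loop above replaces out += ws_ * params->str[i] with a running kernel_stride that multiplies up the kernel spatial extents wS[i], so the flattened weight offset is accumulated in the weight tensor's own layout rather than with the convolution stride.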
View File

@@ -0,0 +1,144 @@
// Copyright © 2024 Apple Inc.
template <typename T, typename U>
[[kernel]] void copy_s(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
uint index [[thread_position_in_grid]]) {
dst[index] = static_cast<U>(src[0]);
}
template <typename T, typename U>
[[kernel]] void copy_v(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
uint index [[thread_position_in_grid]]) {
dst[index] = static_cast<U>(src[index]);
}
template <typename T, typename U>
[[kernel]] void copy_g_nd1(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t& src_stride [[buffer(3)]],
uint index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_1(index, src_stride);
dst[index] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_g_nd2(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto src_idx = elem_to_loc_2(index, src_strides);
int64_t dst_idx = index.x + (int64_t)grid_dim.x * index.y;
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_g_nd3(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto src_idx = elem_to_loc_3(index, src_strides);
int64_t dst_idx =
index.x + (int64_t)grid_dim.x * (index.y + (int64_t)grid_dim.y * index.z);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U, int DIM>
[[kernel]] void copy_g_nd(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto src_idx = elem_to_loc_nd<DIM>(index, src_shape, src_strides);
int64_t dst_idx =
index.x + (int64_t)grid_dim.x * (index.y + (int64_t)grid_dim.y * index.z);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_g(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int& ndim [[buffer(5)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto src_idx = elem_to_loc(index, src_shape, src_strides, ndim);
int64_t dst_idx =
index.x + (int64_t)grid_dim.x * (index.y + (int64_t)grid_dim.y * index.z);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_gg_nd1(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t& src_stride [[buffer(3)]],
constant const int64_t& dst_stride [[buffer(4)]],
uint index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_1(index, src_stride);
auto dst_idx = elem_to_loc_1(index, dst_stride);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_gg_nd2(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint2 index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_2(index, src_strides);
auto dst_idx = elem_to_loc_2(index, dst_strides);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_gg_nd3(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint3 index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_3(index, src_strides);
auto dst_idx = elem_to_loc_3(index, dst_strides);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U, int DIM>
[[kernel]] void copy_gg_nd(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint3 index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_nd<DIM>(index, src_shape, src_strides);
auto dst_idx = elem_to_loc_nd<DIM>(index, src_shape, dst_strides);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_gg(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
constant const int& ndim [[buffer(5)]],
uint3 index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc(index, src_shape, src_strides, ndim);
auto dst_idx = elem_to_loc(index, src_shape, dst_strides, ndim);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
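The g and gg variants differ only in the destination: copy_g_* write to a contiguous dst whose linear index comes from the grid position, while copy_gg_* map both source and destination through strides. A host-side sketch of the elem_to_loc mapping they all rely on (an illustrative assumption; the Metal version lives in kernels/utils.h):

int64_t elem_to_loc(
    int64_t elem,
    const int* shape,
    const int64_t* strides,
    int ndim) {
  int64_t loc = 0;
  // Peel coordinates off the innermost dimension outward and
  // accumulate each one's strided offset.
  for (int i = ndim - 1; i >= 0; --i) {
    loc += (elem % shape[i]) * strides[i];
    elem /= shape[i];
  }
  return loc;
}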

View File

@@ -1,150 +1,9 @@
// Copyright © 2023-2024 Apple Inc.
// Copyright © 2024 Apple Inc.
#include "mlx/backend/metal/kernels/bf16.h"
// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
template <typename T, typename U>
[[kernel]] void copy_s(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
uint index [[thread_position_in_grid]]) {
dst[index] = static_cast<U>(src[0]);
}
template <typename T, typename U>
[[kernel]] void copy_v(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
uint index [[thread_position_in_grid]]) {
dst[index] = static_cast<U>(src[index]);
}
template <typename T, typename U>
[[kernel]] void copy_g_nd1(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t& src_stride [[buffer(3)]],
uint index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_1(index, src_stride);
dst[index] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_g_nd2(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto src_idx = elem_to_loc_2(index, src_strides);
int64_t dst_idx = index.x + (int64_t)grid_dim.x * index.y;
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_g_nd3(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto src_idx = elem_to_loc_3(index, src_strides);
int64_t dst_idx =
index.x + (int64_t)grid_dim.x * (index.y + (int64_t)grid_dim.y * index.z);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U, int DIM>
[[kernel]] void copy_g_nd(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto src_idx = elem_to_loc_nd<DIM>(index, src_shape, src_strides);
int64_t dst_idx =
index.x + (int64_t)grid_dim.x * (index.y + (int64_t)grid_dim.y * index.z);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_g(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int& ndim [[buffer(5)]],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto src_idx = elem_to_loc(index, src_shape, src_strides, ndim);
int64_t dst_idx =
index.x + (int64_t)grid_dim.x * (index.y + (int64_t)grid_dim.y * index.z);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_gg_nd1(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t& src_stride [[buffer(3)]],
constant const int64_t& dst_stride [[buffer(4)]],
uint index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_1(index, src_stride);
auto dst_idx = elem_to_loc_1(index, dst_stride);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_gg_nd2(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint2 index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_2(index, src_strides);
auto dst_idx = elem_to_loc_2(index, dst_strides);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_gg_nd3(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint3 index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_3(index, src_strides);
auto dst_idx = elem_to_loc_3(index, dst_strides);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U, int DIM>
[[kernel]] void copy_gg_nd(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
uint3 index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc_nd<DIM>(index, src_shape, src_strides);
auto dst_idx = elem_to_loc_nd<DIM>(index, src_shape, dst_strides);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
template <typename T, typename U>
[[kernel]] void copy_gg(
device const T* src [[buffer(0)]],
device U* dst [[buffer(1)]],
constant const int* src_shape [[buffer(2)]],
constant const int64_t* src_strides [[buffer(3)]],
constant const int64_t* dst_strides [[buffer(4)]],
constant const int& ndim [[buffer(5)]],
uint3 index [[thread_position_in_grid]]) {
auto src_idx = elem_to_loc(index, src_shape, src_strides, ndim);
auto dst_idx = elem_to_loc(index, src_shape, dst_strides, ndim);
dst[dst_idx] = static_cast<U>(src[src_idx]);
}
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/copy.h"
#define instantiate_copy(name, itype, otype, ctype) \
template [[host_name(name)]] [[kernel]] void copy_##ctype<itype, otype>( \
@@ -152,92 +11,90 @@ template <typename T, typename U>
device otype* dst [[buffer(1)]], \
uint index [[thread_position_in_grid]]);
#define instantiate_copy_g_dim(name, itype, otype, dims) \
template [[host_name(name "_" #dims)]] [[kernel]] void \
copy_g_nd<itype, otype, dims>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int* src_shape [[buffer(2)]], \
constant const int64_t* src_strides [[buffer(3)]], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
template [[host_name("g" name "_" #dims)]] [[kernel]] void \
copy_gg_nd<itype, otype, dims>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int* src_shape [[buffer(2)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int64_t* dst_strides [[buffer(4)]], \
#define instantiate_copy_g_dim(name, itype, otype, dims) \
template [[host_name("g" #dims "_" name)]] [[kernel]] void \
copy_g_nd<itype, otype, dims>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int* src_shape [[buffer(2)]], \
constant const int64_t* src_strides [[buffer(3)]], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
template [[host_name("gg" #dims "_" name)]] [[kernel]] void \
copy_gg_nd<itype, otype, dims>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int* src_shape [[buffer(2)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int64_t* dst_strides [[buffer(4)]], \
uint3 index [[thread_position_in_grid]]);
#define instantiate_copy_g_nd(name, itype, otype) \
template [[host_name(name "_1")]] [[kernel]] void copy_g_nd1<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t& src_stride [[buffer(3)]], \
uint index [[thread_position_in_grid]]); \
template [[host_name(name "_2")]] [[kernel]] void copy_g_nd2<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t* src_strides [[buffer(3)]], \
uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]); \
template [[host_name(name "_3")]] [[kernel]] void copy_g_nd3<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t* src_strides [[buffer(3)]], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
template [[host_name("g" name "_1")]] [[kernel]] void \
copy_gg_nd1<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t& src_stride [[buffer(3)]], \
constant const int64_t& dst_stride [[buffer(4)]], \
uint index [[thread_position_in_grid]]); \
template [[host_name("g" name "_2")]] [[kernel]] void \
copy_gg_nd2<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int64_t* dst_strides [[buffer(4)]], \
uint2 index [[thread_position_in_grid]]); \
template [[host_name("g" name "_3")]] [[kernel]] void \
copy_gg_nd3<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int64_t* dst_strides [[buffer(4)]], \
uint3 index [[thread_position_in_grid]]); \
instantiate_copy_g_dim(name, itype, otype, 4) \
instantiate_copy_g_dim(name, itype, otype, 5)
#define instantiate_copy_g_nd(name, itype, otype) \
template [[host_name("g1_" name)]] [[kernel]] void copy_g_nd1<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t& src_stride [[buffer(3)]], \
uint index [[thread_position_in_grid]]); \
template [[host_name("g2_" name)]] [[kernel]] void copy_g_nd2<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t* src_strides [[buffer(3)]], \
uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]); \
template [[host_name("g3_" name)]] [[kernel]] void copy_g_nd3<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t* src_strides [[buffer(3)]], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
template [[host_name("gg1_" name)]] [[kernel]] void \
copy_gg_nd1<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t& src_stride [[buffer(3)]], \
constant const int64_t& dst_stride [[buffer(4)]], \
uint index [[thread_position_in_grid]]); \
template [[host_name("gg2_" name)]] [[kernel]] void \
copy_gg_nd2<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int64_t* dst_strides [[buffer(4)]], \
uint2 index [[thread_position_in_grid]]); \
template [[host_name("gg3_" name)]] [[kernel]] void \
copy_gg_nd3<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int64_t* dst_strides [[buffer(4)]], \
uint3 index [[thread_position_in_grid]]); \
instantiate_copy_g_dim(name, itype, otype, 4) \
instantiate_copy_g_dim(name, itype, otype, 5)
#define instantiate_copy_g(name, itype, otype) \
template [[host_name(name)]] [[kernel]] void copy_g<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int* src_shape [[buffer(2)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int& ndim [[buffer(5)]], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
template [[host_name("g" name)]] [[kernel]] void copy_gg<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int* src_shape [[buffer(2)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int64_t* dst_strides [[buffer(4)]], \
constant const int& ndim [[buffer(5)]], \
#define instantiate_copy_g(name, itype, otype) \
template [[host_name("g_" name)]] [[kernel]] void copy_g<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int* src_shape [[buffer(2)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int& ndim [[buffer(5)]], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
template [[host_name("gg_" name)]] [[kernel]] void copy_gg<itype, otype>( \
device const itype* src [[buffer(0)]], \
device otype* dst [[buffer(1)]], \
constant const int* src_shape [[buffer(2)]], \
constant const int64_t* src_strides [[buffer(3)]], \
constant const int64_t* dst_strides [[buffer(4)]], \
constant const int& ndim [[buffer(5)]], \
uint3 index [[thread_position_in_grid]]);
// clang-format off
#define instantiate_copy_all(tname, itype, otype) \
instantiate_copy("scopy" #tname, itype, otype, s) \
instantiate_copy("vcopy" #tname, itype, otype, v) \
instantiate_copy_g("gcopy" #tname, itype, otype) \
instantiate_copy_g_nd("gcopy" #tname, itype, otype) // clang-format on
#define instantiate_copy_all(tname, itype, otype) \
instantiate_copy("s_copy" #tname, itype, otype, s) \
instantiate_copy("v_copy" #tname, itype, otype, v) \
instantiate_copy_g("copy" #tname, itype, otype) \
instantiate_copy_g_nd("copy" #tname, itype, otype)
// clang-format off
#define instantiate_copy_itype(itname, itype) \
instantiate_copy_all(itname ##bool_, itype, bool) \
instantiate_copy_all(itname ##uint8, itype, uint8_t) \
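The renamed instantiations above compose host names out of token pasting and stringification. A small stand-alone sketch of the same preprocessor pattern; the macro and type names here are hypothetical, only the mechanics match.

#include <cstdio>

#define NAME_FOR(tname) "s_copy" #tname
#define COPY_ALL(itname, otname) NAME_FOR(itname##otname)

int main() {
  // COPY_ALL(float32, bool_) pastes the tokens, then stringifies:
  // "s_copy" "float32bool_" -> "s_copyfloat32bool_"
  std::printf("%s\n", COPY_ALL(float32, bool_));
  return 0;
}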

View File

@@ -2,17 +2,14 @@
#pragma once
#ifdef __METAL__
#if defined __METAL__ || defined MLX_METAL_JIT
#define MTL_CONST constant
#else
#define MTL_CONST
#endif
static MTL_CONST constexpr int MAX_BINARY_SPECIALIZED_DIMS = 5;
static MTL_CONST constexpr int MAX_COPY_SPECIALIZED_DIMS = 5;
static MTL_CONST constexpr int MAX_REDUCE_SPECIALIZED_DIMS = 4;
static MTL_CONST constexpr int REDUCE_N_READS = 16;
static MTL_CONST constexpr int SOFTMAX_N_READS = 4;
static MTL_CONST constexpr int SOFTMAX_LOOPED_LIMIT = 4096;
static MTL_CONST constexpr int RMS_N_READS = 4;
static MTL_CONST constexpr int RMS_LOOPED_LIMIT = 4096;
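Constants like REDUCE_N_READS tune how many contiguous elements each thread consumes per pass, so the grid shrinks by that factor and loads are amortized. A rough CPU sketch of the idea; the thread and grid bookkeeping here is illustrative, not the kernels' actual dispatch.

#include <cstdio>
#include <vector>

constexpr int N_READS = 16;

float partial_sum(const std::vector<float>& in, int tid, int nthreads) {
  float acc = 0.0f;
  // Each "thread" consumes N_READS contiguous elements per pass, then
  // strides by the whole grid's footprint.
  for (size_t base = size_t(tid) * N_READS; base < in.size();
       base += size_t(nthreads) * N_READS) {
    for (int i = 0; i < N_READS && base + i < in.size(); ++i) {
      acc += in[base + i];
    }
  }
  return acc;
}

int main() {
  std::vector<float> data(1000, 1.0f);
  float total = 0.0f;
  for (int t = 0; t < 4; ++t) total += partial_sum(data, t, 4);
  std::printf("total = %g\n", total); // 1000
  return 0;
}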

View File

@@ -1,7 +1,6 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include <metal_math>
/*
@@ -67,4 +66,4 @@ float erfinv(float a) {
p = metal::fma(p, t, 8.86226892e-1f); // 0x1.c5bf88p-1
}
return a * p;
}
}
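The erfinv tail above is Horner evaluation of a polynomial with fused multiply-adds. A minimal CPU sketch of the same pattern; the coefficients below are made up for illustration.

#include <cmath>
#include <cstdio>

int main() {
  float t = 0.25f;
  float p = 1.0f;              // highest-order coefficient
  p = std::fma(p, t, 0.5f);    // p = p * t + c1
  p = std::fma(p, t, 0.125f);  // p = p * t + c0
  std::printf("p(t) = %g\n", p); // 0.3125
  return 0;
}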

View File

@@ -0,0 +1,45 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/backend/metal/kernels/indexing.h"
template <typename T, typename IdxT, int NIDX, int IDX_NDIM>
METAL_FUNC void gather_impl(
const device T* src [[buffer(0)]],
device T* out [[buffer(1)]],
const constant int* src_shape [[buffer(2)]],
const constant size_t* src_strides [[buffer(3)]],
const constant size_t& src_ndim [[buffer(4)]],
const constant int* slice_sizes [[buffer(5)]],
const constant int* axes [[buffer(6)]],
const thread Indices<IdxT, NIDX>& indices,
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto ind_idx = index.x;
auto ind_offset = index.y;
size_t src_idx = 0;
for (int i = 0; i < NIDX; ++i) {
size_t idx_loc;
if (IDX_NDIM == 0) {
idx_loc = 0;
} else if (IDX_NDIM == 1) {
idx_loc = ind_idx * indices.strides[indices.ndim * i];
} else {
idx_loc = elem_to_loc(
ind_idx,
&indices.shapes[indices.ndim * i],
&indices.strides[indices.ndim * i],
indices.ndim);
}
auto ax = axes[i];
auto idx_val = offset_neg_idx(indices.buffers[i][idx_loc], src_shape[ax]);
src_idx += idx_val * src_strides[ax];
}
auto src_offset = elem_to_loc(ind_offset, slice_sizes, src_strides, src_ndim);
size_t out_idx = index.y + static_cast<size_t>(grid_dim.y) * index.x;
out[out_idx] = src[src_offset + src_idx];
}
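A CPU analogue of what gather_impl computes for a single index array along one axis, including the offset_neg_idx wrap for negative indices. Shapes and values here are illustrative.

#include <cstdio>
#include <vector>

int main() {
  // src is 4x3, row-major; gather rows along axis 0 with slice size {1, 3}.
  std::vector<float> src = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  int src_shape[2] = {4, 3};
  long src_strides[2] = {3, 1};
  std::vector<long> idx = {2, -1, 0}; // -1 wraps to row 3
  std::vector<float> out(idx.size() * 3);

  for (size_t i = 0; i < idx.size(); ++i) {
    long row = idx[i] < 0 ? idx[i] + src_shape[0] : idx[i]; // offset_neg_idx
    for (int j = 0; j < 3; ++j) {
      out[i * 3 + j] = src[row * src_strides[0] + j * src_strides[1]];
    }
  }
  for (float v : out) std::printf("%g ", v); // 6 7 8 9 10 11 0 1 2
  std::printf("\n");
  return 0;
}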

View File

@@ -1,173 +0,0 @@
// Copyright © 2023-2024 Apple Inc.
#include <metal_atomic>
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/indexing.h"
#include "mlx/backend/metal/kernels/utils.h"
using namespace metal;
/////////////////////////////////////////////////////////////////////
// Gather kernel
/////////////////////////////////////////////////////////////////////
template <typename T, typename IdxT, int NIDX, int IDX_NDIM>
METAL_FUNC void gather_impl(
const device T* src [[buffer(0)]],
device T* out [[buffer(1)]],
const constant int* src_shape [[buffer(2)]],
const constant size_t* src_strides [[buffer(3)]],
const constant size_t& src_ndim [[buffer(4)]],
const constant int* slice_sizes [[buffer(5)]],
const constant int* axes [[buffer(6)]],
const thread Indices<IdxT, NIDX>& indices,
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto ind_idx = index.x;
auto ind_offset = index.y;
size_t src_idx = 0;
for (int i = 0; i < NIDX; ++i) {
size_t idx_loc;
if (IDX_NDIM == 0) {
idx_loc = 0;
} else if (IDX_NDIM == 1) {
idx_loc = ind_idx * indices.strides[indices.ndim * i];
} else {
idx_loc = elem_to_loc(
ind_idx,
&indices.shapes[indices.ndim * i],
&indices.strides[indices.ndim * i],
indices.ndim);
}
auto ax = axes[i];
auto idx_val = offset_neg_idx(indices.buffers[i][idx_loc], src_shape[ax]);
src_idx += idx_val * src_strides[ax];
}
auto src_offset = elem_to_loc(ind_offset, slice_sizes, src_strides, src_ndim);
size_t out_idx = index.y + static_cast<size_t>(grid_dim.y) * index.x;
out[out_idx] = src[src_offset + src_idx];
}
#define make_gather_impl(IDX_ARG, IDX_ARR) \
template <typename T, typename IdxT, int NIDX, int IDX_NDIM> \
[[kernel]] void gather( \
const device T* src [[buffer(0)]], \
device T* out [[buffer(1)]], \
const constant int* src_shape [[buffer(2)]], \
const constant size_t* src_strides [[buffer(3)]], \
const constant size_t& src_ndim [[buffer(4)]], \
const constant int* slice_sizes [[buffer(5)]], \
const constant int* axes [[buffer(6)]], \
const constant int* idx_shapes [[buffer(7)]], \
const constant size_t* idx_strides [[buffer(8)]], \
const constant int& idx_ndim [[buffer(9)]], \
IDX_ARG(IdxT) uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]) { \
Indices<IdxT, NIDX> idxs{ \
{{IDX_ARR()}}, idx_shapes, idx_strides, idx_ndim}; \
\
return gather_impl<T, IdxT, NIDX, IDX_NDIM>( \
src, \
out, \
src_shape, \
src_strides, \
src_ndim, \
slice_sizes, \
axes, \
idxs, \
index, \
grid_dim); \
}
#define make_gather(n) make_gather_impl(IDX_ARG_##n, IDX_ARR_##n)
make_gather(0) make_gather(1) make_gather(2) make_gather(3) make_gather(4)
make_gather(5) make_gather(6) make_gather(7) make_gather(8) make_gather(9)
make_gather(10)
/////////////////////////////////////////////////////////////////////
// Gather instantiations
/////////////////////////////////////////////////////////////////////
#define instantiate_gather6(name, src_t, idx_t, nidx, IDX_ARG, nd, nd_name) \
template [[host_name("gather" name "_" #nidx "" #nd_name)]] [[kernel]] void \
gather<src_t, idx_t, nidx, nd>( \
const device src_t* src [[buffer(0)]], \
device src_t* out [[buffer(1)]], \
const constant int* src_shape [[buffer(2)]], \
const constant size_t* src_strides [[buffer(3)]], \
const constant size_t& src_ndim [[buffer(4)]], \
const constant int* slice_sizes [[buffer(5)]], \
const constant int* axes [[buffer(6)]], \
const constant int* idx_shapes [[buffer(7)]], \
const constant size_t* idx_strides [[buffer(8)]], \
const constant int& idx_ndim [[buffer(9)]], \
IDX_ARG(idx_t) uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]);
// clang-format off
#define instantiate_gather5(name, src_t, idx_t, nidx, nd, nd_name) \
instantiate_gather6(name, src_t, idx_t, nidx, IDX_ARG_ ##nidx, nd, nd_name) // clang-format on
// clang-format off
#define instantiate_gather4(name, src_t, idx_t, nidx) \
instantiate_gather5(name, src_t, idx_t, nidx, 0, _0) \
instantiate_gather5(name, src_t, idx_t, nidx, 1, _1) \
instantiate_gather5(name, src_t, idx_t, nidx, 2, )
// Special case for NIDX = 0
instantiate_gather4("bool_", bool, bool, 0)
instantiate_gather4("uint8", uint8_t, bool, 0)
instantiate_gather4("uint16", uint16_t, bool, 0)
instantiate_gather4("uint32", uint32_t, bool, 0)
instantiate_gather4("uint64", uint64_t, bool, 0)
instantiate_gather4("int8", int8_t, bool, 0)
instantiate_gather4("int16", int16_t, bool, 0)
instantiate_gather4("int32", int32_t, bool, 0)
instantiate_gather4("int64", int64_t, bool, 0)
instantiate_gather4("float16", half, bool, 0)
instantiate_gather4("float32", float, bool, 0)
instantiate_gather4("bfloat16", bfloat16_t, bool, 0) // clang-format on
// clang-format off
#define instantiate_gather3(name, src_type, ind_type) \
instantiate_gather4(name, src_type, ind_type, 1) \
instantiate_gather4(name, src_type, ind_type, 2) \
instantiate_gather4(name, src_type, ind_type, 3) \
instantiate_gather4(name, src_type, ind_type, 4) \
instantiate_gather4(name, src_type, ind_type, 5) \
instantiate_gather4(name, src_type, ind_type, 6) \
instantiate_gather4(name, src_type, ind_type, 7) \
instantiate_gather4(name, src_type, ind_type, 8) \
instantiate_gather4(name, src_type, ind_type, 9) \
instantiate_gather4(name, src_type, ind_type, 10) // clang-format on
// clang-format off
#define instantiate_gather(name, src_type) \
instantiate_gather3(#name "bool_", src_type, bool) \
instantiate_gather3(#name "uint8", src_type, uint8_t) \
instantiate_gather3(#name "uint16", src_type, uint16_t) \
instantiate_gather3(#name "uint32", src_type, uint32_t) \
instantiate_gather3(#name "uint64", src_type, uint64_t) \
instantiate_gather3(#name "int8", src_type, int8_t) \
instantiate_gather3(#name "int16", src_type, int16_t) \
instantiate_gather3(#name "int32", src_type, int32_t) \
instantiate_gather3(#name "int64", src_type, int64_t)
instantiate_gather(bool_, bool)
instantiate_gather(uint8, uint8_t)
instantiate_gather(uint16, uint16_t)
instantiate_gather(uint32, uint32_t)
instantiate_gather(uint64, uint64_t)
instantiate_gather(int8, int8_t)
instantiate_gather(int16, int16_t)
instantiate_gather(int32, int32_t)
instantiate_gather(int64, int64_t)
instantiate_gather(float16, half)
instantiate_gather(float32, float)
instantiate_gather(bfloat16, bfloat16_t) // clang-format on

View File

@@ -1,13 +1,9 @@
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <metal_stdlib>
using namespace metal;
/////////////////////////////////////////////////////////////////////
// Indexing utils
/////////////////////////////////////////////////////////////////////
template <typename IdxT, int NIDX>
struct Indices {
const array<const device IdxT*, NIDX> buffers;
@@ -24,31 +20,3 @@ METAL_FUNC size_t offset_neg_idx(IdxT idx, size_t size) {
return (idx < 0) ? idx + size : idx;
}
}
#define IDX_ARG_N(idx_t, n) const device idx_t *idx##n [[buffer(n)]],
#define IDX_ARG_0(idx_t)
#define IDX_ARG_1(idx_t) IDX_ARG_0(idx_t) IDX_ARG_N(idx_t, 21)
#define IDX_ARG_2(idx_t) IDX_ARG_1(idx_t) IDX_ARG_N(idx_t, 22)
#define IDX_ARG_3(idx_t) IDX_ARG_2(idx_t) IDX_ARG_N(idx_t, 23)
#define IDX_ARG_4(idx_t) IDX_ARG_3(idx_t) IDX_ARG_N(idx_t, 24)
#define IDX_ARG_5(idx_t) IDX_ARG_4(idx_t) IDX_ARG_N(idx_t, 25)
#define IDX_ARG_6(idx_t) IDX_ARG_5(idx_t) IDX_ARG_N(idx_t, 26)
#define IDX_ARG_7(idx_t) IDX_ARG_6(idx_t) IDX_ARG_N(idx_t, 27)
#define IDX_ARG_8(idx_t) IDX_ARG_7(idx_t) IDX_ARG_N(idx_t, 28)
#define IDX_ARG_9(idx_t) IDX_ARG_8(idx_t) IDX_ARG_N(idx_t, 29)
#define IDX_ARG_10(idx_t) IDX_ARG_9(idx_t) IDX_ARG_N(idx_t, 30)
#define IDX_ARR_N(n) idx##n,
#define IDX_ARR_0()
#define IDX_ARR_1() IDX_ARR_0() IDX_ARR_N(21)
#define IDX_ARR_2() IDX_ARR_1() IDX_ARR_N(22)
#define IDX_ARR_3() IDX_ARR_2() IDX_ARR_N(23)
#define IDX_ARR_4() IDX_ARR_3() IDX_ARR_N(24)
#define IDX_ARR_5() IDX_ARR_4() IDX_ARR_N(25)
#define IDX_ARR_6() IDX_ARR_5() IDX_ARR_N(26)
#define IDX_ARR_7() IDX_ARR_6() IDX_ARR_N(27)
#define IDX_ARR_8() IDX_ARR_7() IDX_ARR_N(28)
#define IDX_ARR_9() IDX_ARR_8() IDX_ARR_N(29)
#define IDX_ARR_10() IDX_ARR_9() IDX_ARR_N(30)
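The IDX_ARG_n / IDX_ARR_n chains emulate a variadic parameter list: each step appends one more typed buffer argument and one more initializer entry. The same trick in plain C++; the names below are illustrative.

#include <cstdio>

#define ARG_N(t, n) t arg##n,
#define ARG_0(t)
#define ARG_1(t) ARG_0(t) ARG_N(t, 1)
#define ARG_2(t) ARG_1(t) ARG_N(t, 2)

#define ARR_N(n) arg##n,
#define ARR_0()
#define ARR_1() ARR_0() ARR_N(1)
#define ARR_2() ARR_1() ARR_N(2)

// Expands to: int sum2(int arg1, int arg2, int base)
int sum2(ARG_2(int) int base) {
  int all[] = {ARR_2() base}; // {arg1, arg2, base}
  int s = 0;
  for (int v : all) s += v;
  return s;
}

int main() {
  std::printf("%d\n", sum2(2, 3, 1)); // 6
  return 0;
}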

File diff suppressed because it is too large

View File

@@ -0,0 +1,4 @@
#pragma once
#include "mlx/backend/metal/kernels/reduction/reduce_all.h"
#include "mlx/backend/metal/kernels/reduction/reduce_col.h"
#include "mlx/backend/metal/kernels/reduction/reduce_row.h"

View File

@@ -0,0 +1,293 @@
// Copyright © 2024 Apple Inc.
#include <metal_atomic>
#include <metal_simdgroup>
// clang-format off
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/atomic.h"
#include "mlx/backend/metal/kernels/reduction/ops.h"
#include "mlx/backend/metal/kernels/reduction/reduce_init.h"
#include "mlx/backend/metal/kernels/reduce.h"
#define instantiate_reduce_helper_floats(inst_f, name, op) \
inst_f(name, float16, half, op) \
inst_f(name, float32, float, op) \
inst_f(name, bfloat16, bfloat16_t, op)
#define instantiate_reduce_helper_uints(inst_f, name, op) \
inst_f(name, uint8, uint8_t, op) \
inst_f(name, uint16, uint16_t, op) \
inst_f(name, uint32, uint32_t, op)
#define instantiate_reduce_helper_ints(inst_f, name, op) \
inst_f(name, int8, int8_t, op) \
inst_f(name, int16, int16_t, op) \
inst_f(name, int32, int32_t, op)
#define instantiate_reduce_helper_64b(inst_f, name, op) \
inst_f(name, int64, int64_t, op) \
inst_f(name, uint64, uint64_t, op)
#define instantiate_reduce_helper_types(inst_f, name, op) \
instantiate_reduce_helper_floats(inst_f, name, op) \
instantiate_reduce_helper_uints(inst_f, name, op) \
instantiate_reduce_helper_ints(inst_f, name, op)
#define instantiate_reduce_ops(inst_f, type_f) \
type_f(inst_f, sum, Sum) \
type_f(inst_f, prod, Prod) \
type_f(inst_f, min_, Min) \
type_f(inst_f, max_, Max)
// Special case for bool reductions
#define instantiate_reduce_from_types_helper( \
inst_f, name, tname, itype, otype, op) \
inst_f(name##tname, itype, otype, op)
#define instantiate_reduce_from_types(inst_f, name, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, bool_, bool, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, uint8, uint8_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, uint16, uint16_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, uint32, uint32_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, uint64, uint64_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, int8, int8_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, int16, int16_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, int32, int32_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, int64, int64_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, float16, half, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, \
name, \
float32, \
float, \
otype, \
op) \
instantiate_reduce_from_types_helper( \
inst_f, \
name, \
bfloat16, \
bfloat16_t, \
otype, \
op)
#define instantiate_init_reduce(name, otype, op) \
template [[host_name("i_reduce_" #name)]] [[kernel]] void \
init_reduce<otype, op>( \
device otype * out [[buffer(1)]], uint tid [[thread_position_in_grid]]);
#define instantiate_init_reduce_helper(name, tname, type, op) \
instantiate_init_reduce(name##tname, type, op<type>)
instantiate_reduce_ops(instantiate_init_reduce_helper, instantiate_reduce_helper_types)
instantiate_reduce_ops(instantiate_init_reduce_helper, instantiate_reduce_helper_64b)
instantiate_init_reduce(andbool_, bool, And<bool>)
instantiate_init_reduce(orbool_, bool, Or<bool>)
#define instantiate_all_reduce(name, itype, otype, op) \
template [[host_name("all_reduce_" #name)]] [[kernel]] void \
all_reduce<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device mlx_atomic<otype>* out [[buffer(1)]], \
const device size_t& in_size [[buffer(2)]], \
uint gid [[thread_position_in_grid]], \
uint lid [[thread_position_in_threadgroup]], \
uint grid_size [[threads_per_grid]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
#define instantiate_all_reduce_no_atomics(name, itype, otype, op) \
template [[host_name("allNoAtomics_reduce_" #name)]] [[kernel]] void \
all_reduce_no_atomics<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const device size_t& in_size [[buffer(2)]], \
uint gid [[thread_position_in_grid]], \
uint lid [[thread_position_in_threadgroup]], \
uint grid_size [[threads_per_grid]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]], \
uint thread_group_id [[threadgroup_position_in_grid]]);
#define instantiate_same_all_reduce_helper(name, tname, type, op) \
instantiate_all_reduce(name##tname, type, type, op<type>)
#define instantiate_same_all_reduce_na_helper(name, tname, type, op) \
instantiate_all_reduce_no_atomics(name##tname, type, type, op<type>)
instantiate_reduce_ops(instantiate_same_all_reduce_helper, instantiate_reduce_helper_types)
instantiate_reduce_ops(instantiate_same_all_reduce_na_helper, instantiate_reduce_helper_64b)
instantiate_reduce_from_types(instantiate_all_reduce, and, bool, And<bool>)
instantiate_reduce_from_types(instantiate_all_reduce, or, bool, Or<bool>)
// special case bool with larger output type
instantiate_all_reduce(sumbool_, bool, uint32_t, Sum<uint32_t>)
#define instantiate_col_reduce_general(name, itype, otype, op) \
template [[host_name("colGeneral_reduce_" #name)]] [[kernel]] void \
col_reduce_general<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device mlx_atomic<otype>* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& reduction_stride [[buffer(3)]], \
const constant size_t& out_size [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
threadgroup otype* local_data [[threadgroup(0)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 lsize [[threads_per_threadgroup]]);
#define instantiate_col_reduce_general_no_atomics(name, itype, otype, op) \
template \
[[host_name("colGeneralNoAtomics_reduce_" #name)]] [[kernel]] void \
col_reduce_general_no_atomics<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& reduction_stride [[buffer(3)]], \
const constant size_t& out_size [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
threadgroup otype* local_data [[threadgroup(0)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 gid [[thread_position_in_grid]], \
uint3 lsize [[threads_per_threadgroup]], \
uint3 gsize [[threads_per_grid]]);
#define instantiate_col_reduce_small(name, itype, otype, op) \
template [[host_name("colSmall_reduce_" #name)]] [[kernel]] void \
col_reduce_small<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& reduction_stride [[buffer(3)]], \
const constant size_t& out_size [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
const constant size_t& non_col_reductions [[buffer(8)]], \
const constant int* non_col_shapes [[buffer(9)]], \
const constant size_t* non_col_strides [[buffer(10)]], \
const constant int& non_col_ndim [[buffer(11)]], \
uint tid [[thread_position_in_grid]]);
#define instantiate_same_col_reduce_helper(name, tname, type, op) \
instantiate_col_reduce_small(name ##tname, type, type, op<type>) \
instantiate_col_reduce_general(name ##tname, type, type, op<type>)
#define instantiate_same_col_reduce_na_helper(name, tname, type, op) \
instantiate_col_reduce_small(name ##tname, type, type, op<type>) \
instantiate_col_reduce_general_no_atomics(name ##tname, type, type, op<type>)
instantiate_reduce_ops(instantiate_same_col_reduce_helper, instantiate_reduce_helper_types)
instantiate_reduce_ops(instantiate_same_col_reduce_na_helper, instantiate_reduce_helper_64b)
instantiate_col_reduce_general(sumbool_, bool, uint32_t, Sum<uint32_t>)
instantiate_reduce_from_types(instantiate_col_reduce_general, and, bool, And<bool>)
instantiate_reduce_from_types(instantiate_col_reduce_general, or, bool, Or<bool>)
instantiate_col_reduce_small(sumbool_, bool, uint32_t, Sum<uint32_t>)
instantiate_reduce_from_types(instantiate_col_reduce_small, and, bool, And<bool>)
instantiate_reduce_from_types(instantiate_col_reduce_small, or, bool, Or<bool>)
#define instantiate_row_reduce_small(name, itype, otype, op) \
template [[host_name("rowGeneralSmall_reduce_" #name)]] [[kernel]] void \
row_reduce_general_small<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& out_size [[buffer(3)]], \
const constant size_t& non_row_reductions [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
uint lid [[thread_position_in_grid]]); \
template [[host_name("rowGeneralMed_reduce_" #name)]] [[kernel]] void \
row_reduce_general_med<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& out_size [[buffer(3)]], \
const constant size_t& non_row_reductions [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
uint tid [[threadgroup_position_in_grid]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_per_group [[dispatch_simdgroups_per_threadgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
#define instantiate_row_reduce_general(name, itype, otype, op) \
instantiate_row_reduce_small(name, itype, otype, op) \
template \
[[host_name("rowGeneral_reduce_" #name)]] [[kernel]] void \
row_reduce_general<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device mlx_atomic<otype>* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& out_size [[buffer(3)]], \
const constant size_t& non_row_reductions [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 lsize [[threads_per_threadgroup]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
#define instantiate_row_reduce_general_no_atomics(name, itype, otype, op) \
instantiate_row_reduce_small(name, itype, otype, op) \
template \
[[host_name("rowGeneralNoAtomics_reduce_" #name)]] [[kernel]] void \
row_reduce_general_no_atomics<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& out_size [[buffer(3)]], \
const constant size_t& non_row_reductions [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 lsize [[threads_per_threadgroup]], \
uint3 gsize [[threads_per_grid]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
#define instantiate_same_row_reduce_helper(name, tname, type, op) \
instantiate_row_reduce_general(name##tname, type, type, op<type>)
#define instantiate_same_row_reduce_na_helper(name, tname, type, op) \
instantiate_row_reduce_general_no_atomics(name##tname, type, type, op<type>)
instantiate_reduce_ops(instantiate_same_row_reduce_helper, instantiate_reduce_helper_types)
instantiate_reduce_ops(instantiate_same_row_reduce_na_helper, instantiate_reduce_helper_64b)
instantiate_reduce_from_types(instantiate_row_reduce_general, and, bool, And<bool>)
instantiate_reduce_from_types(instantiate_row_reduce_general, or, bool, Or<bool>)
instantiate_row_reduce_general(sumbool_, bool, uint32_t, Sum<uint32_t>)
// clang-format on
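These instantiations all bind an op struct carrying an identity value and a combine operator to a kernel template. A CPU sketch of that contract, including the sumbool_ special case where bool inputs accumulate into a uint32_t so the count does not saturate at 1. All names below are illustrative.

#include <cstdint>
#include <cstdio>
#include <vector>

template <typename U>
struct Sum {
  static constexpr U init = U(0); // identity element
  U operator()(U a, U b) const { return a + b; }
};

template <typename T, typename U, typename Op>
U all_reduce_ref(const std::vector<T>& in) {
  Op op;
  U total = Op::init;
  for (auto v : in) total = op(total, static_cast<U>(v));
  return total;
}

int main() {
  std::vector<bool> flags = {true, false, true, true};
  uint32_t n = all_reduce_ref<bool, uint32_t, Sum<uint32_t>>(flags);
  std::printf("true count = %u\n", n); // 3
  return 0;
}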

View File

@@ -0,0 +1,6 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/backend/metal/kernels/atomic.h"
#include "mlx/backend/metal/kernels/reduction/ops.h"

View File

@@ -1,32 +0,0 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/backend/metal/kernels/reduction/ops.h"
#include "mlx/backend/metal/kernels/reduction/reduce_inst.h"
#include "mlx/backend/metal/kernels/reduction/utils.h"
using namespace metal;
///////////////////////////////////////////////////////////////////////////////
// Reduce init
///////////////////////////////////////////////////////////////////////////////
template <typename T, typename Op>
[[kernel]] void init_reduce(
device T* out [[buffer(0)]],
uint tid [[thread_position_in_grid]]) {
out[tid] = Op::init;
}
#define instantiate_init_reduce(name, otype, op) \
template [[host_name("i" #name)]] [[kernel]] void init_reduce<otype, op>( \
device otype * out [[buffer(1)]], uint tid [[thread_position_in_grid]]);
#define instantiate_init_reduce_helper(name, tname, type, op) \
instantiate_init_reduce(name##tname, type, op<type>)
// clang-format off
instantiate_reduce_ops(instantiate_init_reduce_helper, instantiate_reduce_helper_types)
instantiate_reduce_ops(instantiate_init_reduce_helper, instantiate_reduce_helper_64b)
instantiate_init_reduce(andbool_, bool, And)
instantiate_init_reduce(orbool_, bool, Or) // clang-format on

View File

@@ -5,9 +5,7 @@
#include <metal_atomic>
#include <metal_simdgroup>
#include "mlx/backend/metal/kernels/atomic.h"
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/utils.h"
static constant constexpr const uint8_t simd_size = 32;
union bool4_or_uint {
bool4 b;
@@ -21,6 +19,7 @@ struct None {
}
};
template <typename U = bool>
struct And {
bool simd_reduce(bool val) {
return simd_all(val);
@@ -58,6 +57,7 @@ struct And {
}
};
template <typename U = bool>
struct Or {
bool simd_reduce(bool val) {
return simd_any(val);

View File

@@ -1,11 +1,5 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/backend/metal/kernels/reduction/ops.h"
#include "mlx/backend/metal/kernels/reduction/reduce_inst.h"
#include "mlx/backend/metal/kernels/reduction/utils.h"
using namespace metal;
///////////////////////////////////////////////////////////////////////////////
// All reduce helper
///////////////////////////////////////////////////////////////////////////////
@@ -139,50 +133,3 @@ template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
out[thread_group_id] = total_val;
}
}
#define instantiate_all_reduce(name, itype, otype, op) \
template [[host_name("all_reduce_" #name)]] [[kernel]] void \
all_reduce<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device mlx_atomic<otype>* out [[buffer(1)]], \
const device size_t& in_size [[buffer(2)]], \
uint gid [[thread_position_in_grid]], \
uint lid [[thread_position_in_threadgroup]], \
uint grid_size [[threads_per_grid]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
#define instantiate_all_reduce_no_atomics(name, itype, otype, op) \
template [[host_name("all_reduce_no_atomics_" #name)]] [[kernel]] void \
all_reduce_no_atomics<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const device size_t& in_size [[buffer(2)]], \
uint gid [[thread_position_in_grid]], \
uint lid [[thread_position_in_threadgroup]], \
uint grid_size [[threads_per_grid]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]], \
uint thread_group_id [[threadgroup_position_in_grid]]);
///////////////////////////////////////////////////////////////////////////////
// Instantiations
///////////////////////////////////////////////////////////////////////////////
#define instantiate_same_all_reduce_helper(name, tname, type, op) \
instantiate_all_reduce(name##tname, type, type, op<type>)
#define instantiate_same_all_reduce_na_helper(name, tname, type, op) \
instantiate_all_reduce_no_atomics(name##tname, type, type, op<type>)
// clang-format off
instantiate_reduce_ops(instantiate_same_all_reduce_helper, instantiate_reduce_helper_types)
instantiate_reduce_ops(instantiate_same_all_reduce_na_helper, instantiate_reduce_helper_64b)
instantiate_reduce_from_types(instantiate_all_reduce, and, bool, And)
instantiate_reduce_from_types(instantiate_all_reduce, or, bool, Or)
// special case bool with larger output type
instantiate_all_reduce(sumbool_, bool, uint32_t, Sum<uint32_t>) // clang-format on

View File

@@ -1,11 +1,5 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/backend/metal/kernels/reduction/ops.h"
#include "mlx/backend/metal/kernels/reduction/reduce_inst.h"
#include "mlx/backend/metal/kernels/reduction/utils.h"
using namespace metal;
///////////////////////////////////////////////////////////////////////////////
// Small column reduce kernel
///////////////////////////////////////////////////////////////////////////////
@@ -52,23 +46,6 @@ template <typename T, typename U, typename Op>
out[out_idx] = total_val;
}
#define instantiate_col_reduce_small(name, itype, otype, op) \
template [[host_name("col_reduce_small_" #name)]] [[kernel]] void \
col_reduce_small<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& reduction_stride [[buffer(3)]], \
const constant size_t& out_size [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
const constant size_t& non_col_reductions [[buffer(8)]], \
const constant int* non_col_shapes [[buffer(9)]], \
const constant size_t* non_col_strides [[buffer(10)]], \
const constant int& non_col_ndim [[buffer(11)]], \
uint tid [[thread_position_in_grid]]);
///////////////////////////////////////////////////////////////////////////////
// Column reduce helper
///////////////////////////////////////////////////////////////////////////////
@@ -186,64 +163,3 @@ template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
}
}
}
#define instantiate_col_reduce_general(name, itype, otype, op) \
template [[host_name("col_reduce_general_" #name)]] [[kernel]] void \
col_reduce_general<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device mlx_atomic<otype>* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& reduction_stride [[buffer(3)]], \
const constant size_t& out_size [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
threadgroup otype* local_data [[threadgroup(0)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 lsize [[threads_per_threadgroup]]);
#define instantiate_col_reduce_general_no_atomics(name, itype, otype, op) \
template \
[[host_name("col_reduce_general_no_atomics_" #name)]] [[kernel]] void \
col_reduce_general_no_atomics<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& reduction_stride [[buffer(3)]], \
const constant size_t& out_size [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
threadgroup otype* local_data [[threadgroup(0)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 gid [[thread_position_in_grid]], \
uint3 lsize [[threads_per_threadgroup]], \
uint3 gsize [[threads_per_grid]]);
///////////////////////////////////////////////////////////////////////////////
// Instantiations
///////////////////////////////////////////////////////////////////////////////
// clang-format off
#define instantiate_same_col_reduce_helper(name, tname, type, op) \
instantiate_col_reduce_small(name ##tname, type, type, op<type>) \
instantiate_col_reduce_general(name ##tname, type, type, op<type>) // clang-format on
// clang-format off
#define instantiate_same_col_reduce_na_helper(name, tname, type, op) \
instantiate_col_reduce_small(name ##tname, type, type, op<type>) \
instantiate_col_reduce_general_no_atomics(name ##tname, type, type, op<type>) // clang-format on
// clang-format off
instantiate_reduce_ops(instantiate_same_col_reduce_helper, instantiate_reduce_helper_types)
instantiate_reduce_ops(instantiate_same_col_reduce_na_helper, instantiate_reduce_helper_64b)
instantiate_col_reduce_general(sumbool_, bool, uint32_t, Sum<uint32_t>)
instantiate_reduce_from_types(instantiate_col_reduce_general, and, bool, And)
instantiate_reduce_from_types(instantiate_col_reduce_general, or, bool, Or)
instantiate_col_reduce_small(sumbool_, bool, uint32_t, Sum<uint32_t>)
instantiate_reduce_from_types(instantiate_col_reduce_small, and, bool, And)
instantiate_reduce_from_types(instantiate_col_reduce_small, or, bool, Or) // clang-format on

View File

@@ -0,0 +1,8 @@
// Copyright © 2023-2024 Apple Inc.
template <typename T, typename Op>
[[kernel]] void init_reduce(
device T* out [[buffer(0)]],
uint tid [[thread_position_in_grid]]) {
out[tid] = Op::init;
}
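A CPU analogue of init_reduce: every output element is seeded with the op's identity before partial results are folded into it. MaxOp here is a stand-in for the real op structs.

#include <cstdio>

struct MaxOp {
  static constexpr float init = -1e30f; // stand-in for Limits<float>::min
};

int main() {
  float out[4];
  for (unsigned tid = 0; tid < 4; ++tid) out[tid] = MaxOp::init; // init_reduce body
  std::printf("out[0] = %g\n", out[0]);
  return 0;
}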

View File

@@ -1,71 +0,0 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include <metal_atomic>
#include <metal_simdgroup>
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/reduction/ops.h"
#define instantiate_reduce_helper_floats(inst_f, name, op) \
inst_f(name, float16, half, op) inst_f(name, float32, float, op) \
inst_f(name, bfloat16, bfloat16_t, op)
#define instantiate_reduce_helper_uints(inst_f, name, op) \
inst_f(name, uint8, uint8_t, op) inst_f(name, uint16, uint16_t, op) \
inst_f(name, uint32, uint32_t, op)
#define instantiate_reduce_helper_ints(inst_f, name, op) \
inst_f(name, int8, int8_t, op) inst_f(name, int16, int16_t, op) \
inst_f(name, int32, int32_t, op)
#define instantiate_reduce_helper_64b(inst_f, name, op) \
inst_f(name, int64, int64_t, op) inst_f(name, uint64, uint64_t, op)
#define instantiate_reduce_helper_types(inst_f, name, op) \
instantiate_reduce_helper_floats(inst_f, name, op) \
instantiate_reduce_helper_uints(inst_f, name, op) \
instantiate_reduce_helper_ints(inst_f, name, op)
#define instantiate_reduce_ops(inst_f, type_f) \
type_f(inst_f, sum, Sum) type_f(inst_f, prod, Prod) \
type_f(inst_f, min_, Min) type_f(inst_f, max_, Max)
// Special case for bool reductions
#define instantiate_reduce_from_types_helper( \
inst_f, name, tname, itype, otype, op) \
inst_f(name##tname, itype, otype, op)
#define instantiate_reduce_from_types(inst_f, name, otype, op) \
instantiate_reduce_from_types_helper(inst_f, name, bool_, bool, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, uint8, uint8_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, uint16, uint16_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, uint32, uint32_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, int8, int8_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, int16, int16_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, int32, int32_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, int64, int64_t, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, name, float16, half, otype, op) \
instantiate_reduce_from_types_helper( \
inst_f, \
name, \
float32, \
float, \
otype, \
op) \
instantiate_reduce_from_types_helper( \
inst_f, \
name, \
bfloat16, \
bfloat16_t, \
otype, \
op)

View File

@@ -1,11 +1,5 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/backend/metal/kernels/reduction/ops.h"
#include "mlx/backend/metal/kernels/reduction/reduce_inst.h"
#include "mlx/backend/metal/kernels/reduction/utils.h"
using namespace metal;
///////////////////////////////////////////////////////////////////////////////
// Small row reductions
///////////////////////////////////////////////////////////////////////////////
@@ -123,33 +117,6 @@ template <typename T, typename U, typename Op>
}
}
#define instantiate_row_reduce_small(name, itype, otype, op) \
template [[host_name("row_reduce_general_small_" #name)]] [[kernel]] void \
row_reduce_general_small<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& out_size [[buffer(3)]], \
const constant size_t& non_row_reductions [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
uint lid [[thread_position_in_grid]]); \
template [[host_name("row_reduce_general_med_" #name)]] [[kernel]] void \
row_reduce_general_med<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& out_size [[buffer(3)]], \
const constant size_t& non_row_reductions [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
uint tid [[threadgroup_position_in_grid]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_per_group [[dispatch_simdgroups_per_threadgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
///////////////////////////////////////////////////////////////////////////////
// Large row reductions
///////////////////////////////////////////////////////////////////////////////
@@ -318,61 +285,3 @@ template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
out[(ceildiv(gsize.y, lsize.y) * tid.x) + tid.y] = total_val;
}
}
#define instantiate_row_reduce_general(name, itype, otype, op) \
instantiate_row_reduce_small(name, itype, otype, op) template \
[[host_name("row_reduce_general_" #name)]] [[kernel]] void \
row_reduce_general<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device mlx_atomic<otype>* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& out_size [[buffer(3)]], \
const constant size_t& non_row_reductions [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 lsize [[threads_per_threadgroup]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
#define instantiate_row_reduce_general_no_atomics(name, itype, otype, op) \
instantiate_row_reduce_small(name, itype, otype, op) template \
[[host_name("row_reduce_general_no_atomics_" #name)]] [[kernel]] void \
row_reduce_general_no_atomics<itype, otype, op>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& out_size [[buffer(3)]], \
const constant size_t& non_row_reductions [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 lsize [[threads_per_threadgroup]], \
uint3 gsize [[threads_per_grid]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
///////////////////////////////////////////////////////////////////////////////
// Instantiations
///////////////////////////////////////////////////////////////////////////////
#define instantiate_same_row_reduce_helper(name, tname, type, op) \
instantiate_row_reduce_general(name##tname, type, type, op<type>)
#define instantiate_same_row_reduce_na_helper(name, tname, type, op) \
instantiate_row_reduce_general_no_atomics(name##tname, type, type, op<type>)
// clang-format off
instantiate_reduce_ops(instantiate_same_row_reduce_helper, instantiate_reduce_helper_types)
instantiate_reduce_ops(instantiate_same_row_reduce_na_helper, instantiate_reduce_helper_64b)
instantiate_reduce_from_types(instantiate_row_reduce_general, and, bool, And)
instantiate_reduce_from_types(instantiate_row_reduce_general, or, bool, Or)
instantiate_row_reduce_general(sumbool_, bool, uint32_t, Sum<uint32_t>) // clang-format on

View File

@@ -1,14 +0,0 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include <metal_atomic>
#include <metal_simdgroup>
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/steel/utils.h"
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/reduction/ops.h"
static constant constexpr const uint8_t simd_size = 32;

View File

@@ -0,0 +1,440 @@
// Copyright © 2023-2024 Apple Inc.
template <typename U>
struct CumSum {
static constexpr constant U init = static_cast<U>(0);
template <typename T>
U operator()(U a, T b) {
return a + b;
}
U simd_scan(U x) {
return simd_prefix_inclusive_sum(x);
}
U simd_exclusive_scan(U x) {
return simd_prefix_exclusive_sum(x);
}
};
template <typename U>
struct CumProd {
static constexpr constant U init = static_cast<U>(1.0f);
template <typename T>
U operator()(U a, T b) {
return a * b;
}
U simd_scan(U x) {
return simd_prefix_inclusive_product(x);
}
U simd_exclusive_scan(U x) {
return simd_prefix_exclusive_product(x);
}
};
template <>
struct CumProd<bool> {
static constexpr constant bool init = true;
template <typename T>
bool operator()(bool a, T b) {
return a & static_cast<bool>(b);
}
bool simd_scan(bool x) {
for (int i = 1; i <= 16; i *= 2) {
bool other = simd_shuffle_up(x, i);
x &= other;
}
return x;
}
bool simd_exclusive_scan(bool x) {
x = simd_scan(x);
return simd_shuffle_and_fill_up(x, init, 1);
}
};
template <typename U>
struct CumMax {
static constexpr constant U init = Limits<U>::min;
template <typename T>
U operator()(U a, T b) {
return (a >= b) ? a : b;
}
U simd_scan(U x) {
for (int i = 1; i <= 16; i *= 2) {
U other = simd_shuffle_up(x, i);
x = (x >= other) ? x : other;
}
return x;
}
U simd_exclusive_scan(U x) {
x = simd_scan(x);
return simd_shuffle_and_fill_up(x, init, 1);
}
};
template <typename U>
struct CumMin {
static constexpr constant U init = Limits<U>::max;
template <typename T>
U operator()(U a, T b) {
return (a <= b) ? a : b;
}
U simd_scan(U x) {
for (int i = 1; i <= 16; i *= 2) {
U other = simd_shuffle_up(x, i);
x = (x <= other) ? x : other;
}
return x;
}
U simd_exclusive_scan(U x) {
x = simd_scan(x);
return simd_shuffle_and_fill_up(x, init, 1);
}
};
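A CPU reference for the inclusive/exclusive contract these scan ops implement, shown for CumSum semantics: the exclusive scan is the inclusive scan shifted right by one with Op::init injected at the front.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> x = {1, 2, 3, 4};
  std::vector<int> inc(x.size()), exc(x.size());
  int acc = 0; // CumSum::init
  for (size_t i = 0; i < x.size(); ++i) {
    exc[i] = acc; // exclusive: running value before x[i]
    acc += x[i];  // CumSum::operator()
    inc[i] = acc; // inclusive: running value after x[i]
  }
  for (int v : inc) std::printf("%d ", v); // 1 3 6 10
  std::printf("\n");
  for (int v : exc) std::printf("%d ", v); // 0 1 3 6
  std::printf("\n");
  return 0;
}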
template <typename T, typename U, int N_READS, bool reverse>
inline void load_unsafe(U values[N_READS], const device T* input) {
if (reverse) {
for (int i = 0; i < N_READS; i++) {
values[N_READS - i - 1] = input[i];
}
} else {
for (int i = 0; i < N_READS; i++) {
values[i] = input[i];
}
}
}
template <typename T, typename U, int N_READS, bool reverse>
inline void load_safe(
U values[N_READS],
const device T* input,
int start,
int total,
U init) {
if (reverse) {
for (int i = 0; i < N_READS; i++) {
values[N_READS - i - 1] =
(start + N_READS - i - 1 < total) ? input[i] : init;
}
} else {
for (int i = 0; i < N_READS; i++) {
values[i] = (start + i < total) ? input[i] : init;
}
}
}
template <typename U, int N_READS, bool reverse>
inline void write_unsafe(U values[N_READS], device U* out) {
if (reverse) {
for (int i = 0; i < N_READS; i++) {
out[i] = values[N_READS - i - 1];
}
} else {
for (int i = 0; i < N_READS; i++) {
out[i] = values[i];
}
}
}
template <typename U, int N_READS, bool reverse>
inline void write_safe(U values[N_READS], device U* out, int start, int total) {
if (reverse) {
for (int i = 0; i < N_READS; i++) {
if (start + N_READS - i - 1 < total) {
out[i] = values[N_READS - i - 1];
}
}
} else {
for (int i = 0; i < N_READS; i++) {
if (start + i < total) {
out[i] = values[i];
}
}
}
}
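The reverse flag flips a block into ascending scan order on load and back again on store, so the scan body itself never branches on direction. A minimal sketch of the load side:

#include <cstdio>

template <int N>
void load_rev(int dst[N], const int* src) {
  // Mirror of load_unsafe with reverse = true.
  for (int i = 0; i < N; ++i) dst[N - i - 1] = src[i];
}

int main() {
  int in[4] = {10, 20, 30, 40};
  int vals[4];
  load_rev<4>(vals, in);
  for (int v : vals) std::printf("%d ", v); // 40 30 20 10
  std::printf("\n");
  return 0;
}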
template <
typename T,
typename U,
typename Op,
int N_READS,
bool inclusive,
bool reverse>
[[kernel]] void contiguous_scan(
const device T* in [[buffer(0)]],
device U* out [[buffer(1)]],
const constant size_t& axis_size [[buffer(2)]],
uint gid [[thread_position_in_grid]],
uint lid [[thread_position_in_threadgroup]],
uint lsize [[threads_per_threadgroup]],
uint simd_size [[threads_per_simdgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
Op op;
// Position the pointers
in += (gid / lsize) * axis_size;
out += (gid / lsize) * axis_size;
// Compute the number of simd_groups
uint simd_groups = lsize / simd_size;
// Allocate memory
U prefix = Op::init;
U values[N_READS];
threadgroup U simdgroup_sums[32];
// Loop over the reduced axis in ceildiv(axis_size, N_READS * lsize)
// blocks, each of size N_READS * lsize. For each block:
//   1. Read the block.
//   2. Compute an inclusive scan per thread.
//   3. Compute an exclusive scan of the per-thread sums within each
//      simdgroup.
//   4. Write the simdgroup sums to shared memory and compute their
//      exclusive scan.
//   5. Combine prefix, previous simdgroup, previous thread, and value
//      into the output.
//   6. Write the block.
for (uint r = 0; r < ceildiv(axis_size, N_READS * lsize); r++) {
// Compute the block offset
uint offset = r * lsize * N_READS + lid * N_READS;
// Read the values
if (reverse) {
if ((offset + N_READS) < axis_size) {
load_unsafe<T, U, N_READS, reverse>(
values, in + axis_size - offset - N_READS);
} else {
load_safe<T, U, N_READS, reverse>(
values,
in + axis_size - offset - N_READS,
offset,
axis_size,
Op::init);
}
} else {
if ((offset + N_READS) < axis_size) {
load_unsafe<T, U, N_READS, reverse>(values, in + offset);
} else {
load_safe<T, U, N_READS, reverse>(
values, in + offset, offset, axis_size, Op::init);
}
}
// Compute an inclusive scan per thread
for (int i = 1; i < N_READS; i++) {
values[i] = op(values[i], values[i - 1]);
}
// Compute exclusive scan of thread sums
U prev_thread = op.simd_exclusive_scan(values[N_READS - 1]);
// Write simdgroup_sums to SM
if (simd_lane_id == simd_size - 1) {
simdgroup_sums[simd_group_id] = op(prev_thread, values[N_READS - 1]);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Compute exclusive scan of simdgroup_sums
if (simd_group_id == 0) {
U prev_simdgroup = op.simd_exclusive_scan(simdgroup_sums[simd_lane_id]);
simdgroup_sums[simd_lane_id] = prev_simdgroup;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Compute the output
for (int i = 0; i < N_READS; i++) {
values[i] = op(values[i], prefix);
values[i] = op(values[i], simdgroup_sums[simd_group_id]);
values[i] = op(values[i], prev_thread);
}
// Write the values
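// An inclusive scan is written back in place. An exclusive scan is the
// same data shifted one position towards the end of the scan, with the
// identity Op::init deposited in the first output slot (out[0], or
// out[axis_size - 1] when reverse).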
if (reverse) {
if (inclusive) {
if ((offset + N_READS) < axis_size) {
write_unsafe<U, N_READS, reverse>(
values, out + axis_size - offset - N_READS);
} else {
write_safe<U, N_READS, reverse>(
values, out + axis_size - offset - N_READS, offset, axis_size);
}
} else {
if (lid == 0 && offset == 0) {
out[axis_size - 1] = Op::init;
}
if ((offset + N_READS + 1) < axis_size) {
write_unsafe<U, N_READS, reverse>(
values, out + axis_size - offset - 1 - N_READS);
} else {
write_safe<U, N_READS, reverse>(
values,
out + axis_size - offset - 1 - N_READS,
offset + 1,
axis_size);
}
}
} else {
if (inclusive) {
if ((offset + N_READS) < axis_size) {
write_unsafe<U, N_READS, reverse>(values, out + offset);
} else {
write_safe<U, N_READS, reverse>(
values, out + offset, offset, axis_size);
}
} else {
if (lid == 0 && offset == 0) {
out[0] = Op::init;
}
if ((offset + N_READS + 1) < axis_size) {
write_unsafe<U, N_READS, reverse>(values, out + offset + 1);
} else {
write_safe<U, N_READS, reverse>(
values, out + offset + 1, offset + 1, axis_size);
}
}
}
// Share the prefix
if (simd_group_id == simd_groups - 1 && simd_lane_id == simd_size - 1) {
simdgroup_sums[0] = values[N_READS - 1];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
prefix = simdgroup_sums[0];
}
}
template <
typename T,
typename U,
typename Op,
int N_READS,
bool inclusive,
bool reverse>
[[kernel]] void strided_scan(
const device T* in [[buffer(0)]],
device U* out [[buffer(1)]],
const constant size_t& axis_size [[buffer(2)]],
const constant size_t& stride [[buffer(3)]],
uint2 gid [[threadgroup_position_in_grid]],
uint2 lid [[thread_position_in_threadgroup]],
uint2 lsize [[threads_per_threadgroup]],
uint simd_size [[threads_per_simdgroup]]) {
Op op;
// Allocate memory
threadgroup U read_buffer[N_READS * 32 * 32 + N_READS * 32];
U values[N_READS];
U prefix[N_READS];
for (int i = 0; i < N_READS; i++) {
prefix[i] = Op::init;
}
// Compute offsets
int offset = gid.y * axis_size * stride;
int global_index_x = gid.x * lsize.y * N_READS;
for (uint j = 0; j < axis_size; j += simd_size) {
// Calculate the indices for the current thread
uint index_y = j + lid.y;
uint check_index_y = index_y;
uint index_x = global_index_x + lid.x * N_READS;
if (reverse) {
index_y = axis_size - 1 - index_y;
}
// Read into SM
if (check_index_y < axis_size && (index_x + N_READS) < stride) {
for (int i = 0; i < N_READS; i++) {
read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i] =
in[offset + index_y * stride + index_x + i];
}
} else {
for (int i = 0; i < N_READS; i++) {
if (check_index_y < axis_size && (index_x + i) < stride) {
read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i] =
in[offset + index_y * stride + index_x + i];
} else {
read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i] =
Op::init;
}
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Read the tile transposed into registers
for (int i = 0; i < N_READS; i++) {
values[i] =
read_buffer[lid.x * simd_size * N_READS + lid.y * N_READS + i];
}
// Do we need the following barrier? Shouldn't all simd threads execute
// simultaneously?
simdgroup_barrier(mem_flags::mem_threadgroup);
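// Most likely not: the scan below works on registers via shuffles, and
// each thread rewrites the same read_buffer slot it just read, so the
// simdgroup_barrier appears to be purely defensive.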
// Perform the scan
for (int i = 0; i < N_READS; i++) {
values[i] = op.simd_scan(values[i]);
values[i] = op(values[i], prefix[i]);
prefix[i] = simd_shuffle(values[i], simd_size - 1);
}
// Write to SM
for (int i = 0; i < N_READS; i++) {
read_buffer[lid.x * simd_size * N_READS + lid.y * N_READS + i] =
values[i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Write to device memory
if (!inclusive) {
if (check_index_y == 0) {
if ((index_x + N_READS) < stride) {
for (int i = 0; i < N_READS; i++) {
out[offset + index_y * stride + index_x + i] = Op::init;
}
} else {
for (int i = 0; i < N_READS; i++) {
if ((index_x + i) < stride) {
out[offset + index_y * stride + index_x + i] = Op::init;
}
}
}
}
if (reverse) {
index_y -= 1;
check_index_y += 1;
} else {
index_y += 1;
check_index_y += 1;
}
}
if (check_index_y < axis_size && (index_x + N_READS) < stride) {
for (int i = 0; i < N_READS; i++) {
out[offset + index_y * stride + index_x + i] =
read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i];
}
} else {
for (int i = 0; i < N_READS; i++) {
if (check_index_y < axis_size && (index_x + i) < stride) {
out[offset + index_y * stride + index_x + i] =
read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i];
}
}
}
}
}
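// Taken together: contiguous_scan handles a row that is contiguous in
// memory (one threadgroup per row), while strided_scan handles axes whose
// consecutive elements are `stride` apart, staging transposed tiles
// through threadgroup memory so both loads and stores stay coalesced.
// Presumably the host picks between them based on whether the scanned
// axis is the innermost one.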


@@ -1,455 +1,19 @@
// Copyright © 2023-2024 Apple Inc.
#include <metal_math>
#include <metal_simdgroup>
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"
// clang-format off
using namespace metal;
#include "mlx/backend/metal/kernels/scan.h"
#define instantiate_contiguous_scan( \
name, itype, otype, op, inclusive, reverse, nreads) \
template [[host_name("contiguous_scan_" #name)]] [[kernel]] void \
template [[host_name("contig_scan_" #name)]] [[kernel]] void \
contiguous_scan<itype, otype, op<otype>, nreads, inclusive, reverse>( \
const device itype* in [[buffer(0)]], \
device otype* out [[buffer(1)]], \
@@ -474,7 +38,6 @@ template <
uint2 lsize [[threads_per_threadgroup]], \
uint simd_size [[threads_per_simdgroup]]);
// clang-format off
#define instantiate_scan_helper(name, itype, otype, op, nreads) \
instantiate_contiguous_scan(inclusive_##name, itype, otype, op, true, false, nreads) \
instantiate_contiguous_scan(exclusive_##name, itype, otype, op, false, false, nreads) \
@@ -483,9 +46,8 @@ template <
instantiate_strided_scan(inclusive_##name, itype, otype, op, true, false, nreads) \
instantiate_strided_scan(exclusive_##name, itype, otype, op, false, false, nreads) \
instantiate_strided_scan(reverse_inclusive_##name, itype, otype, op, true, true, nreads) \
instantiate_strided_scan(reverse_exclusive_##name, itype, otype, op, false, true, nreads)
// clang-format off
instantiate_scan_helper(sum_bool__int32, bool, int32_t, CumSum, 4)
instantiate_scan_helper(sum_uint8_uint8, uint8_t, uint8_t, CumSum, 4)
instantiate_scan_helper(sum_uint16_uint16, uint16_t, uint16_t, CumSum, 4)
@@ -537,4 +99,4 @@ instantiate_scan_helper(min_int32_int32, int32_t, int32_t, CumMi
instantiate_scan_helper(min_float16_float16, half, half, CumMin, 4)
instantiate_scan_helper(min_float32_float32, float, float, CumMin, 4)
instantiate_scan_helper(min_bfloat16_bfloat16, bfloat16_t, bfloat16_t, CumMin, 4)
//instantiate_scan_helper(min_complex64_complex64, complex64_t, complex64_t, CumMin) // clang-format on


@@ -0,0 +1,66 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/backend/metal/kernels/indexing.h"
template <typename T, typename IdxT, typename Op, int NIDX>
METAL_FUNC void scatter_1d_index_impl(
const device T* updates [[buffer(1)]],
device mlx_atomic<T>* out [[buffer(2)]],
const constant int* out_shape [[buffer(3)]],
const constant size_t* out_strides [[buffer(4)]],
const constant size_t& upd_size [[buffer(5)]],
const thread array<const device IdxT*, NIDX>& idx_buffers,
uint2 gid [[thread_position_in_grid]]) {
Op op;
uint out_idx = 0;
for (int i = 0; i < NIDX; i++) {
auto idx_val = offset_neg_idx(idx_buffers[i][gid.y], out_shape[i]);
out_idx += idx_val * out_strides[i];
}
op.atomic_update(out, updates[gid.y * upd_size + gid.x], out_idx + gid.x);
}
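// Each gid.y picks one entry from every 1-D index buffer; the target row
// is the sum of the (wrapped) indices times out_strides, and gid.x then
// walks the upd_size elements of that update slice. offset_neg_idx
// presumably maps negative indices into [0, out_shape[i]), NumPy style.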
template <typename T, typename IdxT, typename Op, int NIDX>
METAL_FUNC void scatter_impl(
const device T* updates [[buffer(1)]],
device mlx_atomic<T>* out [[buffer(2)]],
const constant int* upd_shape [[buffer(3)]],
const constant size_t* upd_strides [[buffer(4)]],
const constant size_t& upd_ndim [[buffer(5)]],
const constant size_t& upd_size [[buffer(6)]],
const constant int* out_shape [[buffer(7)]],
const constant size_t* out_strides [[buffer(8)]],
const constant size_t& out_ndim [[buffer(9)]],
const constant int* axes [[buffer(10)]],
const thread Indices<IdxT, NIDX>& indices,
uint2 gid [[thread_position_in_grid]]) {
Op op;
auto ind_idx = gid.y;
auto ind_offset = gid.x;
size_t out_idx = 0;
for (int i = 0; i < NIDX; ++i) {
auto idx_loc = elem_to_loc(
ind_idx,
&indices.shapes[indices.ndim * i],
&indices.strides[indices.ndim * i],
indices.ndim);
auto ax = axes[i];
auto idx_val = offset_neg_idx(indices.buffers[i][idx_loc], out_shape[ax]);
out_idx += idx_val * out_strides[ax];
}
if (upd_size > 1) {
auto out_offset = elem_to_loc(
ind_offset, upd_shape + indices.ndim, out_strides, out_ndim);
out_idx += out_offset;
}
auto upd_idx =
elem_to_loc(gid.y * upd_size + gid.x, upd_shape, upd_strides, upd_ndim);
op.atomic_update(out, updates[upd_idx], out_idx);
}
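// Updates go through op.atomic_update because nothing stops two index
// entries from hitting the same output location; for the combining ops
// (Sum, Prod, Max, Min) duplicates are then merged correctly, albeit in
// no particular order.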


@@ -1,236 +0,0 @@
// Copyright © 2023-2024 Apple Inc.
#include <metal_atomic>
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/indexing.h"
#include "mlx/backend/metal/kernels/reduction/ops.h"
#include "mlx/backend/metal/kernels/utils.h"
using namespace metal;
/////////////////////////////////////////////////////////////////////
// Scatter kernel
/////////////////////////////////////////////////////////////////////
#define make_scatter_1d_index(IDX_ARG, IDX_ARR) \
template <typename T, typename IdxT, typename Op, int NIDX> \
[[kernel]] void scatter_1d_index( \
const device T* updates [[buffer(1)]], \
device mlx_atomic<T>* out [[buffer(2)]], \
const constant int* out_shape [[buffer(3)]], \
const constant size_t* out_strides [[buffer(4)]], \
const constant size_t& upd_size [[buffer(5)]], \
IDX_ARG(IdxT) uint2 gid [[thread_position_in_grid]]) { \
const array<const device IdxT*, NIDX> idx_buffers = {IDX_ARR()}; \
\
return scatter_1d_index_impl<T, IdxT, Op, NIDX>( \
updates, out, out_shape, out_strides, upd_size, idx_buffers, gid); \
}
#define make_scatter_impl(IDX_ARG, IDX_ARR) \
template <typename T, typename IdxT, typename Op, int NIDX> \
[[kernel]] void scatter( \
const device T* updates [[buffer(1)]], \
device mlx_atomic<T>* out [[buffer(2)]], \
const constant int* upd_shape [[buffer(3)]], \
const constant size_t* upd_strides [[buffer(4)]], \
const constant size_t& upd_ndim [[buffer(5)]], \
const constant size_t& upd_size [[buffer(6)]], \
const constant int* out_shape [[buffer(7)]], \
const constant size_t* out_strides [[buffer(8)]], \
const constant size_t& out_ndim [[buffer(9)]], \
const constant int* axes [[buffer(10)]], \
const constant int* idx_shapes [[buffer(11)]], \
const constant size_t* idx_strides [[buffer(12)]], \
const constant int& idx_ndim [[buffer(13)]], \
IDX_ARG(IdxT) uint2 gid [[thread_position_in_grid]]) { \
Indices<IdxT, NIDX> idxs{ \
{{IDX_ARR()}}, idx_shapes, idx_strides, idx_ndim}; \
\
return scatter_impl<T, IdxT, Op, NIDX>( \
updates, \
out, \
upd_shape, \
upd_strides, \
upd_ndim, \
upd_size, \
out_shape, \
out_strides, \
out_ndim, \
axes, \
idxs, \
gid); \
}
#define make_scatter(n) \
make_scatter_impl(IDX_ARG_##n, IDX_ARR_##n) \
make_scatter_1d_index(IDX_ARG_##n, IDX_ARR_##n)
make_scatter(0) make_scatter(1) make_scatter(2) make_scatter(3) make_scatter(4)
make_scatter(5) make_scatter(6) make_scatter(7) make_scatter(8)
make_scatter(9) make_scatter(10)
/////////////////////////////////////////////////////////////////////
// Scatter instantiations
/////////////////////////////////////////////////////////////////////
#define instantiate_scatter5(name, src_t, idx_t, op_t, nidx, IDX_ARG) \
template [[host_name("scatter" name "_" #nidx)]] [[kernel]] void \
scatter<src_t, idx_t, op_t, nidx>( \
const device src_t* updates [[buffer(1)]], \
device mlx_atomic<src_t>* out [[buffer(2)]], \
const constant int* upd_shape [[buffer(3)]], \
const constant size_t* upd_strides [[buffer(4)]], \
const constant size_t& upd_ndim [[buffer(5)]], \
const constant size_t& upd_size [[buffer(6)]], \
const constant int* out_shape [[buffer(7)]], \
const constant size_t* out_strides [[buffer(8)]], \
const constant size_t& out_ndim [[buffer(9)]], \
const constant int* axes [[buffer(10)]], \
const constant int* idx_shapes [[buffer(11)]], \
const constant size_t* idx_strides [[buffer(12)]], \
const constant int& idx_ndim [[buffer(13)]], \
IDX_ARG(idx_t) uint2 gid [[thread_position_in_grid]]);
#define instantiate_scatter6(name, src_t, idx_t, op_t, nidx, IDX_ARG) \
template [[host_name("scatter_1d_index" name "_" #nidx)]] [[kernel]] void \
scatter_1d_index<src_t, idx_t, op_t, nidx>( \
const device src_t* updates [[buffer(1)]], \
device mlx_atomic<src_t>* out [[buffer(2)]], \
const constant int* out_shape [[buffer(3)]], \
const constant size_t* out_strides [[buffer(4)]], \
const constant size_t& upd_size [[buffer(5)]], \
IDX_ARG(idx_t) uint2 gid [[thread_position_in_grid]]);
// clang-format off
#define instantiate_scatter4(name, src_t, idx_t, op_t, nidx) \
instantiate_scatter5(name, src_t, idx_t, op_t, nidx, IDX_ARG_ ##nidx) \
instantiate_scatter6(name, src_t, idx_t, op_t, nidx, IDX_ARG_ ##nidx) // clang-format on
// clang-format off
// Special case NINDEX=0
#define instantiate_scatter_nd0(name, type) \
instantiate_scatter4(#name "none", type, bool, None, 0) \
instantiate_scatter4(#name "_sum", type, bool, Sum<type>, 0) \
instantiate_scatter4(#name "_prod", type, bool, Prod<type>, 0) \
instantiate_scatter4(#name "_max", type, bool, Max<type>, 0) \
instantiate_scatter4(#name "_min", type, bool, Min<type>, 0) // clang-format on
// clang-format off
#define instantiate_scatter3(name, type, ind_type, op_type) \
instantiate_scatter4(name, type, ind_type, op_type, 1) \
instantiate_scatter4(name, type, ind_type, op_type, 2) \
instantiate_scatter4(name, type, ind_type, op_type, 3) \
instantiate_scatter4(name, type, ind_type, op_type, 4) \
instantiate_scatter4(name, type, ind_type, op_type, 5) \
instantiate_scatter4(name, type, ind_type, op_type, 6) \
instantiate_scatter4(name, type, ind_type, op_type, 7) \
instantiate_scatter4(name, type, ind_type, op_type, 8) \
instantiate_scatter4(name, type, ind_type, op_type, 9) \
instantiate_scatter4(name, type, ind_type, op_type, 10) // clang-format on
// clang-format off
#define instantiate_scatter2(name, type, ind_type) \
instantiate_scatter3(name "_none", type, ind_type, None) \
instantiate_scatter3(name "_sum", type, ind_type, Sum<type>) \
instantiate_scatter3(name "_prod", type, ind_type, Prod<type>) \
instantiate_scatter3(name "_max", type, ind_type, Max<type>) \
instantiate_scatter3(name "_min", type, ind_type, Min<type>) // clang-format on
// clang-format off
#define instantiate_scatter(name, type) \
instantiate_scatter2(#name "bool_", type, bool) \
instantiate_scatter2(#name "uint8", type, uint8_t) \
instantiate_scatter2(#name "uint16", type, uint16_t) \
instantiate_scatter2(#name "uint32", type, uint32_t) \
instantiate_scatter2(#name "uint64", type, uint64_t) \
instantiate_scatter2(#name "int8", type, int8_t) \
instantiate_scatter2(#name "int16", type, int16_t) \
instantiate_scatter2(#name "int32", type, int32_t) \
instantiate_scatter2(#name "int64", type, int64_t) // clang-format on
// clang-format off
// TODO uint64 and int64 unsupported
instantiate_scatter_nd0(bool_, bool)
instantiate_scatter_nd0(uint8, uint8_t)
instantiate_scatter_nd0(uint16, uint16_t)
instantiate_scatter_nd0(uint32, uint32_t)
instantiate_scatter_nd0(int8, int8_t)
instantiate_scatter_nd0(int16, int16_t)
instantiate_scatter_nd0(int32, int32_t)
instantiate_scatter_nd0(float16, half)
instantiate_scatter_nd0(float32, float)
instantiate_scatter_nd0(bfloat16, bfloat16_t)
instantiate_scatter(bool_, bool)
instantiate_scatter(uint8, uint8_t)
instantiate_scatter(uint16, uint16_t)
instantiate_scatter(uint32, uint32_t)
instantiate_scatter(int8, int8_t)
instantiate_scatter(int16, int16_t)
instantiate_scatter(int32, int32_t)
instantiate_scatter(float16, half)
instantiate_scatter(float32, float)
instantiate_scatter(bfloat16, bfloat16_t) // clang-format on


@@ -0,0 +1,190 @@
// Copyright © 2023-2024 Apple Inc.
template <typename T>
inline T softmax_exp(T x) {
// Softmax doesn't need a high-precision exponential because x is in
// (-oo, 0] anyway, and the result is subsequently divided by sum(exp(x_i)).
return fast::exp(x);
}
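// Recall that softmax is shift-invariant:
// softmax(x)_i = exp(x_i - m) / sum_j exp(x_j - m) for any constant m,
// so choosing m = max_j x_j keeps every exponent argument in (-oo, 0]
// and fast::exp cannot overflow.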
template <typename T, typename AccT = T, int N_READS = SOFTMAX_N_READS>
[[kernel]] void softmax_single_row(
const device T* in,
device T* out,
constant int& axis_size,
uint gid [[threadgroup_position_in_grid]],
uint _lid [[thread_position_in_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
int lid = _lid;
constexpr int SIMD_SIZE = 32;
threadgroup AccT local_max[SIMD_SIZE];
threadgroup AccT local_normalizer[SIMD_SIZE];
AccT ld[N_READS];
in += gid * axis_size + lid * N_READS;
if (lid * N_READS + N_READS <= axis_size) {
for (int i = 0; i < N_READS; i++) {
ld[i] = AccT(in[i]);
}
} else {
for (int i = 0; i < N_READS; i++) {
ld[i] = ((lid * N_READS + i) < axis_size) ? AccT(in[i])
: Limits<AccT>::finite_min;
}
}
if (simd_group_id == 0) {
local_max[simd_lane_id] = Limits<AccT>::finite_min;
local_normalizer[simd_lane_id] = 0;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Get the max
AccT maxval = Limits<AccT>::finite_min;
for (int i = 0; i < N_READS; i++) {
maxval = (maxval < ld[i]) ? ld[i] : maxval;
}
maxval = simd_max(maxval);
if (simd_lane_id == 0) {
local_max[simd_group_id] = maxval;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (simd_group_id == 0) {
maxval = simd_max(local_max[simd_lane_id]);
if (simd_lane_id == 0) {
local_max[0] = maxval;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
maxval = local_max[0];
// Compute exp(x_i - maxval) and store the partial sums in local_normalizer
AccT normalizer = 0;
for (int i = 0; i < N_READS; i++) {
AccT exp_x = softmax_exp(ld[i] - maxval);
ld[i] = exp_x;
normalizer += exp_x;
}
normalizer = simd_sum(normalizer);
if (simd_lane_id == 0) {
local_normalizer[simd_group_id] = normalizer;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (simd_group_id == 0) {
normalizer = simd_sum(local_normalizer[simd_lane_id]);
if (simd_lane_id == 0) {
local_normalizer[0] = normalizer;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
normalizer = 1 / local_normalizer[0];
// Normalize and write to the output
out += gid * axis_size + lid * N_READS;
if (lid * N_READS + N_READS <= axis_size) {
for (int i = 0; i < N_READS; i++) {
out[i] = T(ld[i] * normalizer);
}
} else {
for (int i = 0; i < N_READS; i++) {
if ((lid * N_READS + i) < axis_size) {
out[i] = T(ld[i] * normalizer);
}
}
}
}
template <typename T, typename AccT = T, int N_READS = SOFTMAX_N_READS>
[[kernel]] void softmax_looped(
const device T* in,
device T* out,
constant int& axis_size,
uint gid [[threadgroup_position_in_grid]],
uint lid [[thread_position_in_threadgroup]],
uint lsize [[threads_per_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
in += gid * axis_size;
constexpr int SIMD_SIZE = 32;
threadgroup AccT local_max[SIMD_SIZE];
threadgroup AccT local_normalizer[SIMD_SIZE];
// Get the max and the normalizer in one go
AccT prevmax;
AccT maxval = Limits<AccT>::finite_min;
AccT normalizer = 0;
for (int r = 0; r < static_cast<int>(ceildiv(axis_size, N_READS * lsize));
r++) {
int offset = r * lsize * N_READS + lid * N_READS;
AccT vals[N_READS];
if (offset + N_READS <= axis_size) {
for (int i = 0; i < N_READS; i++) {
vals[i] = AccT(in[offset + i]);
}
} else {
for (int i = 0; i < N_READS; i++) {
vals[i] = (offset + i < axis_size) ? AccT(in[offset + i])
: Limits<AccT>::finite_min;
}
}
prevmax = maxval;
for (int i = 0; i < N_READS; i++) {
maxval = (maxval < vals[i]) ? vals[i] : maxval;
}
normalizer *= softmax_exp(prevmax - maxval);
for (int i = 0; i < N_READS; i++) {
normalizer += softmax_exp(vals[i] - maxval);
}
}
// We now have a partial normalizer built from
// N_READS * ceildiv(axis_size, N_READS * lsize) chunks. We need to
// combine them.
// 1. We start by finding the max across simd groups
// 2. We then change the partial normalizers to account for a possible
// change in max
// 3. We sum all normalizers
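// The rescaling identity used for step 2: if s = sum_i exp(x_i - m_old)
// and the running max rises to m_new, then
// s * exp(m_old - m_new) = sum_i exp(x_i - m_new),
// which is exactly what `normalizer *= softmax_exp(prevmax - maxval)`
// applies.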
prevmax = maxval;
maxval = simd_max(maxval);
normalizer *= softmax_exp(prevmax - maxval);
normalizer = simd_sum(normalizer);
// Now the normalizer and max value are correct for each simdgroup. We
// write them to shared memory and combine them.
prevmax = maxval;
if (simd_lane_id == 0) {
local_max[simd_group_id] = maxval;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
maxval = simd_max(local_max[simd_lane_id]);
normalizer *= softmax_exp(prevmax - maxval);
if (simd_lane_id == 0) {
local_normalizer[simd_group_id] = normalizer;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
normalizer = simd_sum(local_normalizer[simd_lane_id]);
normalizer = 1 / normalizer;
// Finally given the normalizer and max value we can directly write the
// softmax output
out += gid * axis_size;
for (int r = 0; r < static_cast<int>(ceildiv(axis_size, N_READS * lsize));
r++) {
int offset = r * lsize * N_READS + lid * N_READS;
if (offset + N_READS <= axis_size) {
for (int i = 0; i < N_READS; i++) {
out[offset + i] = T(softmax_exp(in[offset + i] - maxval) * normalizer);
}
} else {
for (int i = 0; i < N_READS; i++) {
if (offset + i < axis_size) {
out[offset + i] =
T(softmax_exp(in[offset + i] - maxval) * normalizer);
}
}
}
}
}
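// A plausible dispatch rule for the two kernels above: softmax_single_row
// when a single pass of lsize * N_READS loads covers axis_size, and
// softmax_looped for longer rows, where the r-loop makes several passes
// while carrying the running max and normalizer.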


@@ -1,205 +1,18 @@
// Copyright © 2023-2024 Apple Inc.
#include <metal_common>
#include <metal_simdgroup>
// clang-format off
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"
using namespace metal;
#include "mlx/backend/metal/kernels/softmax.h"
#define instantiate_softmax(name, itype) \
template [[host_name("softmax_" #name)]] [[kernel]] void \
template [[host_name("block_softmax_" #name)]] [[kernel]] void \
softmax_single_row<itype>( \
const device itype* in, \
device itype* out, \
@@ -208,7 +21,7 @@ template <typename T, typename AccT = T, int N_READS = SOFTMAX_N_READS>
uint _lid [[thread_position_in_threadgroup]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]); \
template [[host_name("softmax_looped_" #name)]] [[kernel]] void \
template [[host_name("looped_softmax_" #name)]] [[kernel]] void \
softmax_looped<itype>( \
const device itype* in, \
device itype* out, \
@@ -220,7 +33,7 @@ template <typename T, typename AccT = T, int N_READS = SOFTMAX_N_READS>
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
#define instantiate_softmax_precise(name, itype) \
template [[host_name("softmax_precise_" #name)]] [[kernel]] void \
template [[host_name("block_softmax_precise_" #name)]] [[kernel]] void \
softmax_single_row<itype, float>( \
const device itype* in, \
device itype* out, \
@@ -229,7 +42,7 @@ template <typename T, typename AccT = T, int N_READS = SOFTMAX_N_READS>
uint _lid [[thread_position_in_threadgroup]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]); \
template [[host_name("softmax_looped_precise_" #name)]] [[kernel]] void \
template [[host_name("looped_softmax_precise_" #name)]] [[kernel]] void \
softmax_looped<itype, float>( \
const device itype* in, \
device itype* out, \
@@ -240,7 +53,6 @@ template <typename T, typename AccT = T, int N_READS = SOFTMAX_N_READS>
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
// clang-format off
instantiate_softmax(float32, float)
instantiate_softmax(float16, half)
instantiate_softmax(bfloat16, bfloat16_t)

Some files were not shown because too many files have changed in this diff.