Compare commits

..

117 Commits

Author SHA1 Message Date
Angelos Katharopoulos
36cff34701 Bump the version (#604) 2024-02-01 11:41:38 -08:00
Awni Hannun
e88e474fd1 Reduce vmap + some fixes (#601) 2024-02-01 11:30:28 -08:00
David Koski
601c6d6aa8 Fix for AdaDelta (#603)
- state was being read from parameter "s"
- but being stored in parameter "u"
2024-02-01 09:56:27 -08:00
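A minimal sketch of the bug class fixed in #603, not the MLX implementation (names and signature are illustrative): an optimizer must read each state entry back under the same key it stored it with.
```
def adadelta_step(param, grad, state, rho=0.9, eps=1e-6, lr=1.0):
    # Accumulate squared gradients and squared updates under fixed keys.
    v = rho * state["v"] + (1 - rho) * grad ** 2
    delta = ((state["u"] + eps) ** 0.5) / ((v + eps) ** 0.5) * grad
    u = rho * state["u"] + (1 - rho) * delta ** 2
    # Store under the same keys read above; reading "s" but writing "u" was the bug.
    state["v"], state["u"] = v, u
    return param - lr * delta
```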
Angelos Katharopoulos
ba8d6bf365 Change the transformer to norm_first by default (#599) 2024-01-31 12:55:30 -08:00
Sugato Ray
4a5f3b21bb Add py.typed to support PEP-561 (type-hinting) for mlx (#588)
* Add `py.typed` to support PEP-561 (type-hinting)

This adds support for type-hinting information as laid out in [PEP-561](https://peps.python.org/pep-0561/).

* add py.typed to MANIFEST.in
2024-01-31 12:05:42 -08:00
Vijay Krish
fcc5ac1c64 Add GPU support for uint64/int64 reductions (#569) 2024-01-31 11:18:04 -08:00
nathan
bad67fec37 Added TeX line breaks to mlx.optimizers.Lion docstring (#595)
Fixes the "misplaced &" MathJax error in documentation.
2024-01-30 19:37:34 -08:00
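For context on the MathJax fix in #595: `&` alignment markers are only valid inside an alignment environment, so a multi-line update rule in a docstring needs something like `aligned` with explicit `\\` line breaks. A hedged example (the Lion equations below are written from memory and are illustrative only):
```
.. math::
   \begin{aligned}
   c_t &= \beta_1 m_{t-1} + (1 - \beta_1) g_t \\
   w_t &= w_{t-1} - \eta \, (\mathrm{sign}(c_t) + \lambda w_{t-1}) \\
   m_t &= \beta_2 m_{t-1} + (1 - \beta_2) g_t
   \end{aligned}
```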
Angelos Katharopoulos
199aebcf77 Change the variance computation (#319) 2024-01-30 19:28:56 -08:00
Angelos Katharopoulos
0de5988f92 Custom VJP and checkpointing (#541)
* Implement custom_vjp and checkpointing
* Add a dependency management primitive
* Change the eval order to deep branches first
* Add graph depth tracking to the array
2024-01-30 16:04:45 -08:00
Jacket
143e2690d5 Fix SGD implementation (#473) 2024-01-30 15:50:46 -08:00
Jagrit Digani
375446453e Update Compute Pipeline Creation API (#581)
* Add option to specialize metal functions on function constants
* Update Compute Pipeline Creation API
* Add options to make libraries from source and stitching
* Update function specialization name options
2024-01-30 15:42:36 -08:00
Angelos Katharopoulos
1895d34c20 Fix log1p with inf inputs (#592) 2024-01-30 14:02:50 -08:00
Awni Hannun
09b9275027 Make shape a tuple (#591)
* shape tuple

* also remove simplify from docs

* rebase
2024-01-30 13:11:01 -08:00
Andre Slavescu
d3a9005454 Softshrink mapping + op (#552)
* Added Softshrink mapping + op

* formatting

* docs + nits in docstring

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-30 12:56:28 -08:00
Jacket
3f7aba8498 Implement diagonal operator (#562)
* Implement diagonal operator

This implements mx.diagonal in operator level, inspired by
@ManishAradwad.

* added `mx.diag` with tests

* corrected few things

* nits in bindings

* updates to diag

---------

Co-authored-by: ManishAradwad <manisharadwad@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-30 09:45:48 -08:00
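A short usage sketch of the ops added in #562, assuming the bindings mirror NumPy's `diag`/`diagonal` semantics:
```
import mlx.core as mx

a = mx.reshape(mx.arange(9), (3, 3))
print(mx.diagonal(a))                 # main diagonal
print(mx.diagonal(a, offset=1))       # first super-diagonal
print(mx.diag(mx.array([1, 2, 3])))   # build a 3x3 diagonal matrix from a vector
```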
Angelos Katharopoulos
65d0b8df9f Fix binary op dispatch (#584) 2024-01-29 19:36:17 -08:00
Awni Hannun
3c2f192345 Propagate nans in binary ops (#579)
* propagate nans in binary ops

* handle empty matmul

* cpu minimum/maximum propagate nan

* benchmark maximum

* add min as well

* throw on negative indices with full

* verbose on linux

* fix matmul for zero K
2024-01-29 11:19:38 -08:00
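A quick illustration of the behaviour change from #579 (printed values are indicative):
```
import mlx.core as mx

a = mx.array([1.0, float("nan")])
b = mx.array([2.0, 3.0])
# NaN now propagates through element-wise min/max instead of being silently dropped.
print(mx.maximum(a, b))   # array([2, nan], dtype=float32)
print(mx.minimum(a, b))   # array([1, nan], dtype=float32)
```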
Angelos Katharopoulos
37d98ba6ff No gil eval (#565) 2024-01-26 22:03:52 -08:00
Awni Hannun
8993382aaa Buffer Donation (#519)
* buffer donation

* fix to move shared pointer

* format

* gpu in place for copy and binary

* revert ops test

* cpu in place

* a little cleanup

* remove useless bench
2024-01-26 16:30:33 -08:00
Awni Hannun
07f35c9d8a Fix a few issues: docs for flatten, erf, dequantize validation (#560)
* doc flatten

* erf doc

* check values for dequantize

* format
2024-01-26 15:16:46 -08:00
Jagrit Digani
bf17ab5002 Add more checks and clearer error messages to conv operations (#563)
* Add more checks and clearer error messages to conv operations
2024-01-26 15:13:26 -08:00
Awni Hannun
8fa6b322b9 Compile front-end (#476)
* fix tests for linux

* make a move on compile

* basic compile scaffold works

* compile binding

* clean

* fix

* fix grad, more tests

* basic python tests

* fix segfault on python exit

* compile works with python closures

* fix test

* fix python globals bug, and erase

* simplify

* more cpp tests

* bug fix with move function and compile at exit

* simplify inputs also

* enable and disable compiler

* remove simplify

* simplify tests use compile now

* fix multi-output with compile

* clear output tree from cache when function goes out of scope

* ../python/src/transforms.cpp

* remove closure capture

* comments
2024-01-26 13:45:30 -08:00
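A minimal usage sketch for the compile front-end from #476; the exact binding name and wrapper behaviour are assumptions based on the commit messages above:
```
import mlx.core as mx

def loss(w, x):
    return mx.mean(mx.square(w * x))

compiled_loss = mx.compile(loss)   # traces on the first call, reuses the compiled graph after
w, x = mx.array(2.0), mx.array([1.0, -1.0])
print(compiled_loss(w, x))         # same value as loss(w, x)
```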
David Koski
874b739f3c Fix cache key in RoPE (#561) 2024-01-26 13:10:02 -08:00
taher
077c1ee64a QR factorization (#310)
* add qr factorization

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-26 09:27:31 -08:00
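A hedged usage sketch for #310; that the factorization lives under `mx.linalg` and needs the CPU stream at this point are assumptions based on the linear-algebra doc page updated later in this diff:
```
import mlx.core as mx

a = mx.array([[2.0, 3.0], [1.0, 2.0]])
q, r = mx.linalg.qr(a, stream=mx.cpu)   # CPU stream assumed to be required here
print(q @ r)                            # reconstructs a up to numerical error
```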
Rifur13
2463496471 [Fix] mx.allclose bug with infinite values (#539)
* Added isclose op and fixed comparison with inf values

* Added 'equal_nan' to match numpy

* format

* Add test

* Update python/src/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Addressed CR comments

* Update python/src/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* nits

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-25 20:47:06 -08:00
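A sketch of the behaviour fixed in #539; the `equal_nan` keyword follows NumPy semantics per the commit message:
```
import mlx.core as mx

inf, nan = float("inf"), float("nan")
print(mx.allclose(mx.array([1.0, inf]), mx.array([1.0, inf])))        # now True
print(mx.isclose(mx.array([nan]), mx.array([nan])))                   # False by default
print(mx.isclose(mx.array([nan]), mx.array([nan]), equal_nan=True))   # True
```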
Angelos Katharopoulos
87b7fa9ba2 Bump the version (#554) 2024-01-25 11:01:05 -08:00
Danilo Peixoto
624065c074 Fix package installation for CI (#521)
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-01-25 09:43:34 -08:00
Awni Hannun
f27ec5e097 More helpful error message in vjp transform + concatenate bug (#543)
* more helpful message in vjp transform

* fix concatenate on mismatch dims

* typo

* typo
2024-01-24 09:58:33 -08:00
Awni Hannun
f30e63353a Minor updates to address a few issues (#537)
* docs on arg indices return type

* arange with nan

* undo isort
2024-01-23 22:24:41 -08:00
Juarez Bochi
4fe2fa2a64 GGUF: Avoid dequantization when format is compatible (#426)
* GGUF: Don't dequantize q4_1

* Fix weight order. First in low bits

* Add unpacking for q4_0

* Don't dequantize q8_0

* rebase quants and split file

* don't quantize every weight

* reapply patch

* error handling

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 15:43:57 -08:00
Hazem Essam
37fc9db82c Added Adafactor (#415)
* Added adafactor

* Added Adafactor and ran pre-commit

* modified operations

* Added docstrings

* Switched two ops to fix a bug

* added underscore for internal functions and removed the plus sign in the last return statement

* Removed parameter rms from the optimizer state because it's not needed

* Added simple MNIST test for Adafactor and temporary training log

* remove test files

* nits in docs

* comment nit

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 15:11:27 -08:00
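A minimal training-step sketch using the optimizer from #415; the no-argument constructor (relative step sizes when no learning rate is given) is an assumption:
```
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim

model = nn.Linear(4, 2)
optimizer = optim.Adafactor()   # assumed default: relative step size, no explicit learning rate

def loss_fn(model, x, y):
    return nn.losses.mse_loss(model(x), y, reduction="mean")

loss, grads = nn.value_and_grad(model, loss_fn)(model, mx.ones((8, 4)), mx.ones((8, 2)))
optimizer.update(model, grads)
mx.eval(model.parameters(), optimizer.state)
```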
AtomicVar
755dcf6137 Enable cross_entropy loss to handle dense targets (#517)
* Enable cross_entropy loss to handle dense targets

Dense targets mean probabilities or one-hot encodings.

* better shape check of weights

* nits in docstring

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 12:17:22 -08:00
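A sketch of the two target formats the loss accepts after #517:
```
import mlx.core as mx
import mlx.nn as nn

logits = mx.array([[2.0, -1.0], [-1.0, 2.0]])
# Class-index targets (previous behaviour) and dense probability / one-hot targets both work.
print(nn.losses.cross_entropy(logits, mx.array([0, 1])))
print(nn.losses.cross_entropy(logits, mx.array([[0.9, 0.1], [0.1, 0.9]])))
```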
LeonEricsson
6b4b30e3fc Common neural network initializers nn.initializers (#456)
* initial commit: constant, normal, uniform

* identity, glorot and he initializers

* docstrings

* rm file

* nits

* nits

* nits

* testing suite

* docs

* nits in docs

* more docs

* remove unused template

* rename package to nn.init

* docs, receptive field

* more docs

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 06:47:20 -08:00
Awni Hannun
86e0c79467 remove stale benchmarks (#527) 2024-01-22 22:17:58 -08:00
Awni Hannun
98c37d3a22 use axes in tensordot (#525) 2024-01-22 21:17:00 -08:00
Sugato Ray
f326dd8334 Update README.md (#524)
Add conda install option in docs.
2024-01-22 20:53:54 -08:00
Jagrit Digani
6d3bee3364 Fix oob reads in gemv kernel (#523) 2024-01-22 12:06:04 -08:00
Danilo Peixoto
ecb174ca9d Type annotations for mlx.core module (#512) 2024-01-21 12:53:12 -08:00
Awni Hannun
7a34e46677 Quantize with groups of 32 (#511)
* allow quantize with group sizes of 32

* missing cpu dispatch

* remove print

* Fix qvm for group_size 32

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-21 06:19:05 -08:00
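A usage sketch for #511 with the newly allowed group size; keyword names follow the quantized-matmul benchmark updated later in this diff and should be treated as assumptions:
```
import mlx.core as mx

x = mx.random.normal((2, 64))
w = mx.random.normal((64, 64))
w_q, scales, biases = mx.quantize(w, group_size=32, bits=4)   # group_size=32 newly supported
y = mx.quantized_matmul(x, w_q, scales, biases, group_size=32, bits=4)
print(y.shape)   # (2, 64)
```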
Nripesh Niketan
92c22c1ea3 feat: Update isort version to 5.13.2 (#514) 2024-01-21 06:11:48 -08:00
Awni Hannun
d52383367a format (#510) 2024-01-20 10:33:46 -08:00
Arda Orçun
363d3add6d Add ValueError message for Adamax (#508)
* ValueError message added

* beta errors added

* some corrections and testing

* Learning rate limitation deleted
2024-01-20 07:56:15 -08:00
Awni Hannun
b207c2c86b Power VJP fix for 0 (#505) 2024-01-20 01:17:40 -08:00
Awni Hannun
6bf779e72b fix array from list for > 32 bit types (#501) 2024-01-19 15:49:25 -08:00
Juarez Bochi
ddf50113c5 GGUF: Load and save metadata (#446)
* gguf metadata
---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-19 14:06:05 -08:00
Arda Orçun
6589c869d6 Added MSE message (#500)
* Added MSE message

* changed wrong line.

* Update examples/python/linear_regression.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-01-19 06:27:50 -08:00
Anchen
f6feb61f92 feat: add support for saving safetensors in the save_weights (#497)
* feat: add save safetensors support in module save_weights

* chore: checking missing changes

* Update python/mlx/nn/layers/base.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* chore: update docstring for load_weights

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-01-19 06:19:33 -08:00
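A short sketch for #497; that the output format is chosen from the file extension is an assumption based on the docstring update mentioned above:
```
import mlx.nn as nn

model = nn.Linear(8, 8)
model.save_weights("weights.safetensors")   # ".safetensors" assumed to select the new format
model.load_weights("weights.safetensors")
```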
Awni Hannun
c4ec836523 fix isinf for integer types (#494) 2024-01-19 05:31:10 -08:00
AtomicVar
550d4bf7c0 Update binary_cross_entropy function to handle both logits and probabilities (#492) 2024-01-18 19:22:23 -08:00
Awni Hannun
f6e911ced0 version bump (#490)
* version bump

* Fix the dev version string

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-18 12:00:24 -08:00
Awni Hannun
3d99a8d31d Fix format / build (#489) 2024-01-18 10:01:59 -08:00
Ethan
a749a91c75 Support disabling the metal buffer cache to prevent performance degradation caused by large memory caching (#390)
* support disabling the metal buffer cache, since large amounts of unused memory stay buffered when an LLM generates long context token sequences

* Run format and add "cache_enabled" feature tests
2024-01-18 08:33:34 -08:00
toji
49a52610b7 Added formatter structure and a boolean value formatter (#354)
* added formatter structure and a boolean value formatter

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-18 07:49:41 -08:00
AtomicVar
d1fef34138 Add Gaussian NLL loss function (#477)
* Add Gaussian NLL loss function

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-18 06:44:44 -08:00
Angelos Katharopoulos
9c111f176d Fix split optimization for array iterator (#484) 2024-01-18 05:50:25 -08:00
Awni Hannun
78e5f2d17d usage doc for function transformations (#481) 2024-01-17 17:10:53 -08:00
Angelos Katharopoulos
90c234b7ac Fix round to round half-cases to even (#482) 2024-01-17 15:27:23 -08:00
Angelos Katharopoulos
135fd796d2 Fix detach for multi-output primitives (#480) 2024-01-17 14:08:07 -08:00
Jagrit Digani
78102a47ad Update GEMM (#424)
* Organize and collect metal subroutine templates and elements in `metal/kernels/steel/`
* Update gemm elements for better performance 
* Add split-K specialization for gemm
* Add `addmm` primitive, op and bindings for fused matmul and bias addition 
* Update tests and benchmarks as needed
2024-01-17 12:42:39 -08:00
Diogo
556cdf0e06 Resolves build issues with the extension example (#419)
* resolved extension build issues and added test to ci

* missing gguflib

* rebased

* force mlx install from fix branch

* linux build issue

* point to git install and comment out ci tests
2024-01-17 12:07:05 -08:00
Awni Hannun
275db7221a Command buffer reports errors (#479)
* command buffer reports errors

* typo

* simplify
2024-01-17 11:53:30 -08:00
AtomicVar
4a9012cba0 Sort some APIs docs by names (a-z) (#472) 2024-01-16 19:37:50 -08:00
Awni Hannun
a2bf7693dd Primitive's VJP takes outputs as input (#475)
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-16 19:03:53 -08:00
Angelos Katharopoulos
d8fabaa12b Split multi output (#461)
* Multi-output split primitive
* Add the multi-output split to the ArrayIterator
* Add some grad tests for split
2024-01-16 13:33:55 -08:00
Avikant Srivastava
4e290d282f feat: add time based seed to random.h (#457)
* random seed from time

* fix: chrono

* refactor: snake case
2024-01-16 07:32:28 -08:00
Yashraj Singh
e72458a3fa implemented isposinf and isneginf in one PR (#470)
* ran precommit

* updated docs
2024-01-16 06:48:07 -08:00
Awni Hannun
a2ffea683a Fix eye for larger matrices (#463)
* fix eye
* fix scatter for <32bit (non native atomic) types
* fix int overflow
2024-01-16 00:51:24 -08:00
Angelos Katharopoulos
c15fe3e61b Allow arbitrary first dimension in quantization kernels. (#458)
* Allow arbitrary first dim on qmm_t and qmv
* Allow arbitrary first dim on qmm and qvm
* Specialized aligned vs unaligned case
* Add more checks for valid quantizations
2024-01-16 00:46:21 -08:00
Tristan Bilot
f44c132f4a Add scatter_min VJP (#462) 2024-01-16 00:37:40 -08:00
Matthew Ernst
92a2fdd577 Adds isinf (#445)
* adds isinf

Signed-off-by: matthewfernst <matthew.f.ernst@gmail.com>

* use stream + nits

* typo

---------

Signed-off-by: matthewfernst <matthew.f.ernst@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-15 19:50:44 -08:00
Tristan Bilot
6022d4129e scatter_max vjp + bindings + tests (#431)
Co-authored-by: DjamelMesbah <djamel.mesbah@adservio.fr>
2024-01-14 14:12:15 -08:00
Awni Hannun
4bc446be08 Use a dummy primitive to only sync with one output (#453)
* Use a dummy primitive to only sync with one output
* Fix test and choose stream with slight care
2024-01-14 14:09:40 -08:00
Awni Hannun
41cc7bdfdb Fix stub generation, change graph exporting for arrows to go to outputs (#455) 2024-01-14 14:06:16 -08:00
Awni Hannun
6e81c3e164 Sync only with outputs we need to sync with (#447) 2024-01-13 01:47:25 -08:00
Diogo
2e29d0815b Add tile op (#438) 2024-01-12 23:03:16 -08:00
Awni Hannun
1b71487e1f docs (#444) 2024-01-12 13:34:16 -08:00
Ayush Shridhar
1416e7b664 Add isnan (#423) 2024-01-12 11:16:48 -08:00
davidkoski
29081204d1 array.swapaxes should point to swapaxes free function (#441) 2024-01-12 11:06:16 -08:00
Angelos Katharopoulos
006d01ba42 Fix packaging of gguflib (#435) 2024-01-11 13:56:03 -08:00
Awni Hannun
46dc24d835 version bump (#433) 2024-01-11 12:29:35 -08:00
Awni Hannun
c9934fe8a4 Metal validation (#432)
* tests clear metal validation

* add cpp test with metal validation to circleci

* nit
2024-01-11 11:57:24 -08:00
Avikant Srivastava
975e265f74 feat: Add numpy constants (#428)
* add numpy constants

* feat: add unittests

* add newaxis

* add test for newaxis transformation

* refactor
2024-01-11 06:47:29 -08:00
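A sketch of the constants added in #428, mirroring NumPy's names:
```
import mlx.core as mx

print(mx.pi, mx.e, mx.inf)
x = mx.arange(3)
print(x[:, mx.newaxis].shape)   # (3, 1): newaxis behaves like NumPy's
```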
Awni Hannun
c92a134b0d more docs (#421)
* more docs

* fix link

* nits + comments
2024-01-10 14:04:12 -08:00
Awni Hannun
3b4f066dac Correct types for vjp + tests (#418)
* correct types for vjp + tests

* fix build + comment
2024-01-10 13:32:37 -08:00
Juarez Bochi
b7f905787e GGUF support (#350)
* Initial GGUF support for tensor fields.

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-10 13:22:48 -08:00
Chunyang Wen
e3e933c6bc Add type hint for Module (#412) 2024-01-10 11:23:42 -08:00
Awni Hannun
1d90a76d63 in place ops behave in place, fix some overloads (#411) 2024-01-09 16:05:38 -08:00
Angelos Katharopoulos
961435a243 Scatter vjp (#394)
* Add a first scatter vjp
* Implement the scatter_add vjp
* Add array.at to implement user friendly scatters
2024-01-09 13:36:51 -08:00
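A sketch of the `array.at` style scatter from #394; the exact printed values assume float32 defaults:
```
import mlx.core as mx

a = mx.zeros((4,))
idx = mx.array([0, 0, 2])
# Plain fancy-index assignment keeps one write per bucket; .at accumulates duplicates.
print(a.at[idx].add(1.0))   # array([2, 0, 1, 0], dtype=float32)
```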
Awni Hannun
e9ca65c939 Fix BN stats to not expand shape (#409)
* fix BN stats to not expand shape

* nit
2024-01-09 11:54:51 -08:00
Dwayne Robinson
753867123d Fix data_types.rst uint64 (#406)
uint64 correctly says 8 bytes, but its description was copy-pasted from the 32-bit row.
2024-01-09 06:40:10 -08:00
Awni Hannun
f099ebe535 Multi output primitives (#330)
* Multi-output primitives

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-08 16:39:08 -08:00
BigsnarfDude
f45f70f133 Update mlx-example link for llms llama in llama-inference.rst (#405) 2024-01-08 16:29:53 -08:00
YUN, Junwoo
0b8aeddac6 Additional losses (#336)
* cosine similarity loss

---------

Co-authored-by: Awni Hannun <awni@apple.com>

* Docstring nits
2024-01-08 14:01:13 -08:00
Jagrit Digani
432ee5650b Update cpp tests with allclose and doctest::Approx for numerical tolerance (#401) 2024-01-08 09:35:05 -08:00
Nripesh Niketan
73321b8097 feat: add logicalAnd and logicalOR (#386)
* feat: add logicalAnd and logicalOR

* run pre-commit

* Refactor logical_and and logical_or functions

* Add acknowledgement

* Add logical AND and logical OR operators

* Refactor logical_and and logical_or functions

* Add support for logical operators on bool arrays

* Update mlx/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update mlx/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Add logical AND and OR operators for arrays and scalars

* Refactor vjp and jvp methods in primitives.cpp

* Add overloaded operators for logical AND and OR

* format

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-08 07:00:05 -08:00
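A usage sketch of the ops and operator overloads added in #386:
```
import mlx.core as mx

a = mx.array([True, False, True])
b = mx.array([True, True, False])
print(mx.logical_and(a, b))
print(mx.logical_or(a, b))
print(a & b, a | b)   # overloaded operators for bool arrays, added in the same change
```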
Hazem Essam
022a944367 Added GLU activation function and Gated activation function (#329)
* Added GLU activation function and gated activation function

* Ran pre-commit

* Ran pre commit

* Removed old sigmoid implementation to match with main

* Removed gated activation from __init__.py

* Removed unused test cases

* Removed unused imports

* format / docstring

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-08 06:13:16 -08:00
Chris Costes
026ef9aae4 Update Install Instructions (#397)
* Add note to install instructions for building from source to ensure native arm64 environment and tools.

* Add troubleshooting info.

* remove cmake bits

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-07 19:11:04 -08:00
Angelos Katharopoulos
a611b0bc82 Removes the retain_graph flag (#385)
* Adds global tracing flag
* Removes retain_graph in favor of is_tracer
2024-01-07 15:16:51 -08:00
Diogo
449b43762e Add inner / outer op (#348)
* inner / outer impl

* python tests

* ops list and ack

* updated descriptions

* use test helper

* removed dtype check and flatten outer to 1-D

* updated docs

* just use the reshape to flatten
2024-01-07 09:01:09 -08:00
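A usage sketch of the ops from #348:
```
import mlx.core as mx

a = mx.array([1.0, 2.0, 3.0])
b = mx.array([4.0, 5.0, 6.0])
print(mx.inner(a, b))        # 32, inner product over the last axes
print(mx.outer(a, b).shape)  # (3, 3)
```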
Angelos Katharopoulos
6ea6b4258d Fix style check (#395) 2024-01-07 05:54:58 -08:00
Anchen
48f6ca8c3a Add theta cache for Rope and mask cache for ALiBi (#375) 2024-01-07 00:22:58 -08:00
Awni Hannun
c6d2878c1a safely divide for 0 size inputs (#388) 2024-01-07 00:19:54 -08:00
Awni Hannun
b34bf5d52b fix saving for non-contiguous arrays (#389) 2024-01-06 12:44:02 -08:00
Angelos Katharopoulos
608bd43604 Move the matmul type check in the op (#384) 2024-01-05 19:10:13 -08:00
Angelos Katharopoulos
4c48f6460d Fix segfault from buffer protocol and tests (#383)
* Fix segfault from buffer protocol and tests

* Fix tf test
2024-01-05 18:17:44 -08:00
Daniel Strobusch
1331fa19f6 Make array conform to the Python Buffer Protocol (#323) 2024-01-05 15:58:33 -08:00
Daniel Strobusch
dfdb284e16 make behaviour of dtype arguments consistent and compliant to numpy (#379)
All functions that take an optional dtype should

* have a default dtype visible in the generated docs (accomplished via `"dtype"_a = std::optional{float32}`)
* behave identical when `dtype=None` or no dtype is passed

This is important when passing keyword args down from a NumPy-style function like:

```
def f(x, dtype=None):
  mx.random.uniform(dtype=dtype)
  # ...
```

NumPy functions behave like this.

It also fixes a minor bug in `tri`: #378

Closes #378
2024-01-05 09:37:46 -08:00
mutexuan
d8f41a5c0f support python mlx.array creation from list of mlx.array's (#325)
* support python mlx.array creation from list of mlx.array's

* include bfloat16 in UT

* refactor so that sub-arrays made entirely of Python primitive types get initialized by fill_vector

* address PR comment: arr.shape().size() -> arr.ndim()

* address PR comment: get back Dtype constness and let stack to handle type promotions automatically
2024-01-04 18:53:33 -08:00
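A sketch of the construction enabled by #325:
```
import mlx.core as mx

rows = [mx.array([1, 2]), mx.array([3, 4])]
m = mx.array(rows)   # previously each row had to be converted back to a Python list first
print(m.shape)       # (2, 2)
```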
Awni Hannun
b9e415d19c bump pre commit and fix format (#373) 2024-01-04 16:28:52 -08:00
davidkoski
c82a8cc526 move all ObjC (via metal-cpp) interaction until post static initializers (#370)
* move all ObjC (via metal-cpp) interaction until post static initializers

- metal-cpp relies on static initializers to cache class and selector pointers
- code in mlx was using metal-cpp to set up NSAutoreleasePools during its own static init time
- but this code was silently failing as the class and selector pointers from metal-cpp were still nil

- defer the creation of NSAutoreleasePools until after static init time
- ensure that we have coverage where autorelease pools are needed

* Update device.cpp

remove commented code

* Update device.cpp

remove commented out code

* Update scheduler.h

update comment

* per discussion use the pool inside the task() -- this will be metal only, not needed for cpu

* Update allocator.cpp

move pool to release/alloc area
2024-01-04 16:12:00 -08:00
Angelos Katharopoulos
75dc537e44 Fix the sigmoid module (#371) 2024-01-04 13:16:36 -08:00
Awni Hannun
cf88db44b5 revert copy (#366) 2024-01-04 10:43:29 -08:00
Chunyang Wen
16856a0160 Remove useless pass (#364)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2024-01-04 06:34:01 -08:00
Awni Hannun
d752f8e142 Fix CI (#359)
* fix ci

* check for linux for fp16
2024-01-04 06:33:08 -08:00
toji
d2467c320d Added support for python copy (#335)
* Added support for python copy

* precommit changes

* removed `_compiled_call_impl` line

* added tests and suggested changes

* ACK changes
2024-01-03 20:59:40 -08:00
Diogo
0d31128a44 use union instead of | (#358) 2024-01-03 19:33:19 -08:00
Diogo
1ac18eac20 simple numpy helper for tests (#352) 2024-01-03 19:19:19 -08:00
187 changed files with 15607 additions and 4333 deletions

View File

@@ -26,18 +26,28 @@ jobs:
command: |
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install pybind11-stubgen
pip install numpy
sudo apt-get update
sudo apt-get install libblas-dev
sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
- run:
name: Build python package
name: Install Python package
command: |
CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" CMAKE_BUILD_PARALLEL_LEVEL="" python3 setup.py build_ext --inplace
CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" CMAKE_BUILD_PARALLEL_LEVEL="" python3 setup.py develop
- run:
name: Run the python tests
name: Generate package stubs
command: |
python3 -m unittest discover python/tests
python3 setup.py generate_stubs
- run:
name: Run Python tests
command: |
python3 -m unittest discover python/tests -v
# TODO: Reenable when extension api becomes stable
# - run:
# name: Build example extension
# command: |
# cd examples/extensions && python3 -m pip install .
- run:
name: Build CPP only
command: |
@@ -60,25 +70,47 @@ jobs:
conda activate runner-env
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install pybind11-stubgen
pip install numpy
pip install torch
pip install tensorflow
pip install unittest-xml-reporting
- run:
name: Build python package
name: Install Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
CMAKE_BUILD_PARALLEL_LEVEL="" python setup.py build_ext --inplace
CMAKE_BUILD_PARALLEL_LEVEL="" python setup.py develop
- run:
name: Run the python tests
name: Generate package stubs
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
python setup.py generate_stubs
- run:
name: Run Python tests
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
DEVICE=gpu python -m xmlrunner discover -v python/tests -o test-results/gpu
# TODO: Reenable when extension api becomes stable
# - run:
# name: Build example extension
# command: |
# eval "$(conda shell.bash hook)"
# conda activate runner-env
# cd examples/extensions && python -m pip install .
- store_test_results:
path: test-results
- run:
name: Build CPP only
command: |
mkdir -p build && cd build && cmake .. && make -j
- run:
name: Run CPP tests
command: METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
build_release:
machine: true
@@ -101,10 +133,27 @@ jobs:
conda activate runner-env
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install pybind11-stubgen
pip install numpy
pip install twine
# TODO: Update build system to switch away from setup.py develop
- run:
name: Build package
name: Install Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVELOPER_DIR=$(developer_dir_macos_<< parameters.macos_version >>) \
PYPI_RELEASE=1 \
CMAKE_BUILD_PARALLEL_LEVEL="" \
python setup.py develop
- run:
name: Generate package stubs
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
python setup.py generate_stubs
- run:
name: Publish Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
@@ -137,10 +186,26 @@ jobs:
conda activate runner-env
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install pybind11-stubgen
pip install numpy
pip install twine
- run:
name: Build package
name: Install Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVELOPER_DIR=$(developer_dir_macos_<< parameters.macos_version >>) \
DEV_RELEASE=1 \
CMAKE_BUILD_PARALLEL_LEVEL="" \
python setup.py develop
- run:
name: Generate package stubs
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
python setup.py generate_stubs
- run:
name: Publish Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
@@ -173,10 +238,25 @@ jobs:
conda activate runner-env
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install pybind11-stubgen
pip install numpy
pip install twine
- run:
name: Build package
name: Install Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVELOPER_DIR=$(developer_dir_macos_<< parameters.macos_version >>) \
CMAKE_BUILD_PARALLEL_LEVEL="" \
python setup.py develop
- run:
name: Generate package stubs
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
python setup.py generate_stubs
- run:
name: Build package distribution
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env

View File

@@ -5,11 +5,11 @@ repos:
- id: clang-format
# Using this mirror lets us use mypyc-compiled black, which is about 2x faster
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 22.10.0
rev: 23.12.1
hooks:
- id: black
- repo: https://github.com/pycqa/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
args:

View File

@@ -7,10 +7,10 @@ with a short description of your contribution(s) below. For example:
MLX was developed with contributions from the following individuals:
- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops.
- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops.
- Juarez Bochi: Fixed bug in cross attention.
- Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot` and safetensor support
- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile` and safetensor support
- Gabrijel Boduljak: Added `mlx.core.linalg`, implemented `norm` method and `InstanceNorm` layer.
<a href="https://github.com/ml-explore/mlx/graphs/contributors">

View File

@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.24)
project(mlx LANGUAGES CXX)
project(mlx LANGUAGES C CXX)
# ----------------------------- Setup -----------------------------
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
@@ -18,7 +18,7 @@ option(MLX_BUILD_METAL "Build metal backend" ON)
option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
if(NOT MLX_VERSION)
set(MLX_VERSION 0.0.7)
set(MLX_VERSION 0.1.0)
endif()
# --------------------- Processor tests -------------------------
@@ -29,9 +29,15 @@ set(MLX_BUILD_ARM OFF)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
if (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64")
message(WARNING
"Building for x86_64 on macOS is not supported."
if (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64" AND ${CMAKE_HOST_APPLE})
message(FATAL_ERROR
"Building for x86_64 on macOS is not supported."
" If you are on an Apple silicon system, check the build"
" documentation for possible fixes: "
"https://ml-explore.github.io/mlx/build/html/install.html#build-from-source")
elseif (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64")
message(WARNING
"Building for x86_64 on macOS is not supported."
" If you are on an Apple silicon system, "
" make sure you are building for arm64.")
elseif(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "arm64")
@@ -69,7 +75,7 @@ elseif (MLX_BUILD_METAL)
COMMAND_ERROR_IS_FATAL ANY)
message(STATUS "Building with SDK for macOS version ${MACOS_VERSION}")
if (${MACOS_VERSION} GREATER_EQUAL 14.2)
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14.2_iOS17.2.zip)
elseif (${MACOS_VERSION} GREATER_EQUAL 14.0)
@@ -98,15 +104,6 @@ elseif (MLX_BUILD_METAL)
${QUARTZ_LIB})
endif()
MESSAGE(STATUS "Downloading json")
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
FetchContent_MakeAvailable(json)
target_include_directories(
mlx PUBLIC
$<BUILD_INTERFACE:${json_SOURCE_DIR}/single_include/nlohmann>
$<INSTALL_INTERFACE:include/json>
)
find_library(ACCELERATE_LIBRARY Accelerate)
if (MLX_BUILD_ARM AND ACCELERATE_LIBRARY)
message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
@@ -126,16 +123,27 @@ else()
/usr/include
/usr/local/include
$ENV{BLAS_HOME}/include)
message(STATUS ${BLAS_LIBRARIES})
message(STATUS ${BLAS_INCLUDE_DIRS})
message(STATUS "Blas lib" ${BLAS_LIBRARIES})
message(STATUS "Blas incclude" ${BLAS_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
target_link_libraries(mlx ${BLAS_LIBRARIES})
find_package(LAPACK REQUIRED)
if (NOT LAPACK_FOUND)
message(FATAL_ERROR "Must have LAPACK installed")
endif()
find_path(LAPACK_INCLUDE_DIRS lapacke.h
/usr/include
/usr/local/include)
message(STATUS "Lapack lib" ${LAPACK_LIBRARIES})
message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
target_link_libraries(mlx ${LAPACK_LIBRARIES})
endif()
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)
target_include_directories(
mlx
mlx
PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:include>

View File

@@ -1,3 +1,4 @@
include CMakeLists.txt
recursive-include mlx/ *
include python/src/*
python/mlx/py.typed # support type hinting as in PEP-561

View File

@@ -61,17 +61,25 @@ variety of examples, including:
## Quickstart
See the [quick start
guide](https://ml-explore.github.io/mlx/build/html/quick_start.html)
guide](https://ml-explore.github.io/mlx/build/html/usage/quick_start.html)
in the documentation.
## Installation
MLX is available on [PyPI](https://pypi.org/project/mlx/). To install the Python API, run:
**With `pip`**:
```
pip install mlx
```
**With `conda`**:
```
conda install -c conda-forge mlx
```
Check out the
[documentation](https://ml-explore.github.io/mlx/build/html/install.html#)
for more information on building the C++ and Python APIs from source.

View File

@@ -233,6 +233,20 @@ void time_gather_scatter() {
TIME(single_element_add);
}
void time_divmod() {
auto a = random::normal({1000});
auto b = random::normal({1000});
eval({a, b});
auto divmod_fused = [&a, &b]() { return divmod(a, b); };
TIME(divmod_fused);
auto divmod_separate = [&a, &b]() {
return std::vector<array>{floor_divide(a, b), remainder(a, b)};
};
TIME(divmod_separate);
}
int main() {
std::cout << "Benchmarks for " << default_device() << std::endl;
time_creation_ops();
@@ -246,4 +260,5 @@ int main() {
time_matmul();
time_reductions();
time_gather_scatter();
time_divmod();
}

View File

@@ -166,13 +166,13 @@ if __name__ == "__main__":
dtypes = ("float32", "float16")
transposes = ("nn", "nt", "tn")
shapes = (
(16, 234, 768, 3072),
(1, 64, 64, 25344),
(16, 1024, 1024, 1024),
(1, 1024, 1024, 2048),
(4, 1024, 1024, 4096),
(4, 1024, 4096, 1024),
(1, 4096, 4096, 4096),
(15, 1023, 1023, 1023),
(17, 1025, 1025, 1025),
)
for dtype in dtypes:

View File

@@ -60,20 +60,60 @@ def matmul(x, y):
mx.eval(ys)
def _quant_matmul(x, w, s, b, group_size, bits):
def _quant_matmul(x, w, s, b, transpose, group_size, bits):
ys = []
for i in range(10):
ys.append(mx.quantized_matmul(x, w, s, b, group_size=group_size, bits=bits))
ys.append(
mx.quantized_matmul(
x, w, s, b, transpose=transpose, group_size=group_size, bits=bits
)
)
mx.eval(ys)
quant_matmul = {
"quant_matmul_64_2": partial(_quant_matmul, group_size=64, bits=2),
"quant_matmul_64_4": partial(_quant_matmul, group_size=64, bits=4),
"quant_matmul_64_8": partial(_quant_matmul, group_size=64, bits=8),
"quant_matmul_128_2": partial(_quant_matmul, group_size=128, bits=2),
"quant_matmul_128_4": partial(_quant_matmul, group_size=128, bits=4),
"quant_matmul_128_8": partial(_quant_matmul, group_size=128, bits=8),
"quant_matmul_32_2": partial(_quant_matmul, transpose=False, group_size=32, bits=2),
"quant_matmul_32_4": partial(_quant_matmul, transpose=False, group_size=32, bits=4),
"quant_matmul_32_8": partial(_quant_matmul, transpose=False, group_size=32, bits=8),
"quant_matmul_64_2": partial(_quant_matmul, transpose=False, group_size=64, bits=2),
"quant_matmul_64_4": partial(_quant_matmul, transpose=False, group_size=64, bits=4),
"quant_matmul_64_8": partial(_quant_matmul, transpose=False, group_size=64, bits=8),
"quant_matmul_128_2": partial(
_quant_matmul, transpose=False, group_size=128, bits=2
),
"quant_matmul_128_4": partial(
_quant_matmul, transpose=False, group_size=128, bits=4
),
"quant_matmul_128_8": partial(
_quant_matmul, transpose=False, group_size=128, bits=8
),
"quant_matmul_t_32_2": partial(
_quant_matmul, transpose=True, group_size=32, bits=2
),
"quant_matmul_t_32_4": partial(
_quant_matmul, transpose=True, group_size=32, bits=4
),
"quant_matmul_t_32_8": partial(
_quant_matmul, transpose=True, group_size=32, bits=8
),
"quant_matmul_t_64_2": partial(
_quant_matmul, transpose=True, group_size=64, bits=2
),
"quant_matmul_t_64_4": partial(
_quant_matmul, transpose=True, group_size=64, bits=4
),
"quant_matmul_t_64_8": partial(
_quant_matmul, transpose=True, group_size=64, bits=8
),
"quant_matmul_t_128_2": partial(
_quant_matmul, transpose=True, group_size=128, bits=2
),
"quant_matmul_t_128_4": partial(
_quant_matmul, transpose=True, group_size=128, bits=4
),
"quant_matmul_t_128_8": partial(
_quant_matmul, transpose=True, group_size=128, bits=8
),
}
@@ -229,6 +269,13 @@ def linear(w, b, x):
mx.eval(ys)
def linear_fused(w, b, x):
ys = []
for i in range(10):
ys.append(mx.addmm(b, x, mx.transpose(w, (1, 0))))
mx.eval(ys)
def rope(x):
*_, N, D = x.shape
ys = []
@@ -369,7 +416,10 @@ if __name__ == "__main__":
print(bench(quant_matmul[args.benchmark], *xs))
elif args.benchmark == "linear":
print(bench(linear, *xs))
if args.fused:
print(bench(linear_fused, *xs))
else:
print(bench(linear, *xs))
elif args.benchmark == "sum_axis":
print(bench(reduction, "sum", axis, x))

View File

@@ -1,198 +0,0 @@
# Copyright © 2023 Apple Inc.
import math
import time
import jax
import jax.numpy as jnp
from flax import linen as nn
class RoPE(nn.Module):
dims: int
traditional: bool = False
def _compute_rope(self, costheta, sintheta, x):
x1 = x[..., : self.dims // 2]
x2 = x[..., self.dims // 2 : self.dims]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
rx = jnp.concatenate([rx1, rx2, x[..., self.dims :]], axis=-1)
else:
rx = jnp.concatenate([rx1, rx2], axis=-1)
return rx
def _compute_traditional_rope(self, costheta, sintheta, x):
x1 = x[..., ::2]
x2 = x[..., 1::2]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
raise NotImplementedError(
"RoPE doesn't implement partial traditional application"
)
rx = jnp.concatenate([rx1[..., None], rx2[..., None]], axis=-1)
return rx
@staticmethod
def create_cos_sin_theta(
N: int,
D: int,
offset: int = 0,
base: float = 10000,
dtype=jnp.float32,
):
D = D // 2
positions = jnp.arange(offset, N, dtype=dtype)
freqs = jnp.exp(-jnp.arange(0, D, dtype=dtype) * (math.log(base) / D))
theta = positions.reshape((-1, 1)) * freqs.reshape((1, -1))
costheta = jnp.cos(theta)
sintheta = jnp.sin(theta)
return costheta, sintheta
@nn.compact
def __call__(self, x, offset: int = 0):
shape = x.shape
x = x.reshape((-1, shape[-2], shape[-1]))
N = x.shape[1] + offset
costheta, sintheta = RoPE.create_cos_sin_theta(
N, self.dims, offset=offset, dtype=x.dtype
)
rope = (
self._compute_traditional_rope if self.traditional else self._compute_rope
)
rx = rope(costheta, sintheta, x)
return rx.reshape(shape)
class LlamaAttention(nn.Module):
dims: int
num_heads: int
dtype: jnp.dtype
def setup(self):
num_heads = self.num_heads
dims = self.dims
self.rope = RoPE(dims // num_heads, True)
self.query_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
self.key_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
self.value_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
self.out_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
def __call__(self, queries, keys, values, mask=None, cache=None):
queries = self.query_proj(queries)
keys = self.key_proj(keys)
values = self.value_proj(values)
num_heads = self.num_heads
B, L, D = queries.shape
queries = queries.reshape((B, L, num_heads, -1)).transpose((0, 2, 1, 3))
keys = keys.reshape((B, L, num_heads, -1)).transpose((0, 2, 1, 3))
values = values.reshape((B, L, num_heads, -1)).transpose((0, 2, 1, 3))
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = jnp.concatenate([key_cache, keys], axis=2)
values = jnp.concatenate([value_cache, values], axis=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
# Dimensions are [batch x num heads x sequence x hidden dim]
scale = math.sqrt(1 / queries.shape[-1])
scores = (queries * scale) @ keys.transpose((0, 1, 3, 2))
if mask is not None:
scores = scores + mask
scores = jax.nn.softmax(scores, axis=-1)
values_hat = (scores @ values).transpose((0, 2, 1, 3)).reshape((B, L, -1))
return self.out_proj(values_hat), (keys, values)
class LlamaEncoderLayer(nn.Module):
dims: int
mlp_dims: int
num_heads: int
dtype: jnp.dtype
def setup(self):
dims = self.dims
mlp_dims = self.mlp_dims
num_heads = self.num_heads
self.attention = LlamaAttention(dims, num_heads, dtype)
self.norm1 = nn.RMSNorm(param_dtype=self.dtype)
self.norm2 = nn.RMSNorm(param_dtype=self.dtype)
self.linear1 = nn.Dense(mlp_dims, use_bias=False, param_dtype=self.dtype)
self.linear2 = nn.Dense(mlp_dims, use_bias=False, param_dtype=self.dtype)
self.linear3 = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
def __call__(self, x, mask=None, cache=None):
y = self.norm1(x)
y, cache = self.attention(y, y, y, mask, cache)
x = x + y
y = self.norm2(x)
a = self.linear1(y)
b = self.linear2(y)
y = jax.nn.silu(a) * b
y = self.linear3(y)
x = x + y
return x, cache
def measure(model, x, cache):
for i in range(5):
y, c = model(x, mask=None, cache=cache)
jax.block_until_ready((y, c))
start = time.time()
for i in range(5):
y, c = model(x, mask=None, cache=cache)
jax.block_until_ready((y, c))
end = time.time()
return (end - start) * 1000 / 5
if __name__ == "__main__":
H = 32
D = 4096
F = 43 * 256
C = 1000
dtype = jnp.float16
k1, k2, k3, k4 = jax.random.split(jax.random.PRNGKey(0), 4)
x = jax.random.normal(k1, (1, 1, D), dtype)
cache = [
jax.random.normal(k2, [1, H, C, D // H], dtype),
jax.random.normal(k3, [1, H, C, D // H], dtype),
]
layer = LlamaEncoderLayer(D, F, H, dtype=dtype)
params = layer.init(k4, x, mask=None, cache=cache)["params"]
@jax.jit
def model_fn(x, mask, cache):
return layer.apply({"params": params}, x, mask=mask, cache=cache)
T = measure(model_fn, x, cache)
print("Time per layer per token:", T, "ms")
print("Lower bound total time per token:", T * 32, "ms")

View File

@@ -1,118 +0,0 @@
# Copyright © 2023 Apple Inc.
import math
import time
import mlx.core as mx
import mlx.nn as nn
import mlx.utils
class LlamaAttention(nn.Module):
def __init__(self, dims: int, num_heads: int):
super().__init__()
self.num_heads = num_heads
self.rope = nn.RoPE(dims // num_heads, True)
self.query_proj = nn.Linear(dims, dims, False)
self.key_proj = nn.Linear(dims, dims, False)
self.value_proj = nn.Linear(dims, dims, False)
self.out_proj = nn.Linear(dims, dims, False)
def __call__(self, queries, keys, values, mask=None, cache=None):
queries = self.query_proj(queries)
keys = self.key_proj(keys)
values = self.value_proj(values)
num_heads = self.num_heads
B, L, D = queries.shape
queries = mx.transpose(mx.reshape(queries, (B, L, num_heads, -1)), (0, 2, 1, 3))
keys = mx.transpose(mx.reshape(keys, (B, L, num_heads, -1)), (0, 2, 1, 3))
values = mx.transpose(mx.reshape(values, (B, L, num_heads, -1)), (0, 2, 1, 3))
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = mx.concatenate([key_cache, keys], axis=2)
values = mx.concatenate([value_cache, values], axis=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
# Dimensions are [batch x num heads x sequence x hidden dim]
scale = mx.array(math.sqrt(1 / queries.shape[-1]), dtype=queries.dtype)
scores = (queries * scale) @ mx.transpose(keys, (0, 1, 3, 2))
if mask is not None:
scores = scores + mask
scores = mx.softmax(scores, axis=-1)
values_hat = mx.reshape(mx.transpose(scores @ values, (0, 2, 1, 3)), (B, L, -1))
return self.out_proj(values_hat), (keys, values)
class LlamaEncoderLayer(nn.Module):
def __init__(self, dims: int, mlp_dims: int, num_heads: int):
super().__init__()
self.attention = LlamaAttention(dims, num_heads)
self.norm1 = nn.RMSNorm(dims)
self.norm2 = nn.RMSNorm(dims)
self.linear1 = nn.Linear(dims, mlp_dims, False)
self.linear2 = nn.Linear(dims, mlp_dims, False)
self.linear3 = nn.Linear(mlp_dims, dims, False)
def __call__(self, x, mask=None, cache=None):
y = self.norm1(x)
y, cache = self.attention(y, y, y, mask, cache)
x = x + y
y = self.norm2(x)
a = self.linear1(y)
b = self.linear2(y)
y = a * mx.sigmoid(a) * b
y = self.linear3(y)
x = x + y
return x, cache
def measure(model, x, cache):
for i in range(5):
y, c = model(x, mask=None, cache=cache)
mx.eval(y, c)
start = time.time()
rs = []
for i in range(5):
y, c = model(x, mask=None, cache=cache)
rs.append((y, c))
mx.eval(rs)
end = time.time()
return (end - start) * 1000 / 5
if __name__ == "__main__":
H = 32
D = 4096
F = 43 * 256
C = 1000
mx.set_default_device(mx.gpu)
dtype = mx.float16
layer = LlamaEncoderLayer(D, F, H)
layer.update(mlx.utils.tree_map(lambda x: x.astype(dtype), layer.parameters()))
k1, k2, k3 = mx.random.split(mx.random.key(0), 3)
x = mx.random.normal([1, 1, D], dtype=dtype)
cache = [
mx.random.normal([1, H, C, D // H], dtype=dtype),
mx.random.normal([1, H, C, D // H], dtype=dtype),
]
mx.eval(x, cache)
T = measure(layer, x, cache)
print("Time per layer per token:", T, "ms")
print("Lower bound total time per token:", T * 32, "ms")

View File

@@ -1,199 +0,0 @@
# Copyright © 2023 Apple Inc.
import math
import time
import torch
import torch.mps
import torch.nn as nn
def sync_if_needed(x):
if x.device != torch.device("cpu"):
torch.mps.synchronize()
class RoPE(nn.Module):
def __init__(self, dims: int, traditional: bool = False):
super().__init__()
self.dims = dims
self.traditional = traditional
def _compute_rope(self, costheta, sintheta, x):
x1 = x[..., : self.dims // 2]
x2 = x[..., self.dims // 2 : self.dims]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
rx = torch.cat([rx1, rx2, x[..., self.dims :]], dim=-1)
else:
rx = torch.cat([rx1, rx2], dim=-1)
return rx
def _compute_traditional_rope(self, costheta, sintheta, x):
x1 = x[..., ::2]
x2 = x[..., 1::2]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
raise NotImplementedError(
"RoPE doesn't implement partial traditional application"
)
rx = torch.cat([rx1[..., None], rx2[..., None]], dim=-1)
return rx
def forward(self, x, offset: int = 0):
shape = x.shape
x = x.view(-1, shape[-2], shape[-1])
N = x.shape[1] + offset
costheta, sintheta = RoPE.create_cos_sin_theta(
N, self.dims, offset=offset, device=x.device, dtype=x.dtype
)
rope = (
self._compute_traditional_rope if self.traditional else self._compute_rope
)
rx = rope(costheta, sintheta, x)
return rx.view(*shape)
@staticmethod
def create_cos_sin_theta(
N: int,
D: int,
offset: int = 0,
base: float = 10000,
device="cpu",
dtype=torch.float32,
):
D = D // 2
positions = torch.arange(offset, N, dtype=dtype, device=device)
freqs = torch.exp(
-torch.arange(0, D, dtype=dtype, device=device) * (math.log(base) / D)
)
theta = positions.view(-1, 1) * freqs.view(1, -1)
costheta = torch.cos(theta)
sintheta = torch.sin(theta)
return costheta, sintheta
class RMSNorm(nn.Module):
def __init__(self, dims: int, epsilon: float = 1e-6):
super().__init__()
self.gamma = nn.Parameter(torch.ones((dims,)))
self.epsilon = epsilon
def forward(self, x):
n = torch.rsqrt(x.square().mean(dim=-1, keepdims=True) + self.epsilon)
return self.gamma * x * n
class LlamaAttention(nn.Module):
def __init__(self, dims: int, num_heads: int):
super().__init__()
self.num_heads = num_heads
self.rope = RoPE(dims // num_heads, True)
self.query_proj = nn.Linear(dims, dims, bias=False)
self.key_proj = nn.Linear(dims, dims, bias=False)
self.value_proj = nn.Linear(dims, dims, bias=False)
self.out_proj = nn.Linear(dims, dims, bias=False)
def forward(self, queries, keys, values, mask=None, cache=None):
queries = self.query_proj(queries)
keys = self.key_proj(keys)
values = self.value_proj(values)
num_heads = self.num_heads
B, L, D = queries.shape
queries = queries.view(B, L, num_heads, -1).permute(0, 2, 1, 3)
keys = keys.view(B, L, num_heads, -1).permute(0, 2, 1, 3)
values = values.view(B, L, num_heads, -1).permute(0, 2, 1, 3)
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = torch.cat([key_cache, keys], dim=2)
values = torch.cat([value_cache, values], dim=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
# Dimensions are [batch x num heads x sequence x hidden dim]
scale = math.sqrt(1 / queries.shape[-1])
scores = (queries * scale) @ keys.permute(0, 1, 3, 2)
if mask is not None:
scores = scores + mask
scores = torch.softmax(scores, dim=-1)
values_hat = (scores @ values).permute(0, 2, 1, 3).reshape(B, L, -1)
return self.out_proj(values_hat), (keys, values)
class LlamaEncoderLayer(nn.Module):
def __init__(self, dims: int, mlp_dims: int, num_heads: int):
super().__init__()
self.attention = LlamaAttention(dims, num_heads)
self.norm1 = RMSNorm(dims)
self.norm2 = RMSNorm(dims)
self.linear1 = nn.Linear(dims, mlp_dims, bias=False)
self.linear2 = nn.Linear(dims, mlp_dims, bias=False)
self.linear3 = nn.Linear(mlp_dims, dims, bias=False)
def forward(self, x, mask=None, cache=None):
y = self.norm1(x)
y, cache = self.attention(y, y, y, mask, cache)
x = x + y
y = self.norm2(x)
a = self.linear1(y)
b = self.linear2(y)
y = torch.nn.functional.silu(a) * b
y = self.linear3(y)
x = x + y
return x, cache
@torch.no_grad()
def measure(model, x, cache):
for i in range(5):
y, c = model(x, mask=None, cache=cache)
sync_if_needed(x)
start = time.time()
for i in range(5):
y, c = model(x, mask=None, cache=cache)
sync_if_needed(x)
end = time.time()
return (end - start) * 1000 / 5
if __name__ == "__main__":
H = 32
D = 4096
F = 43 * 256
C = 1000
device = torch.device("mps")
dtype = torch.float16
layer = LlamaEncoderLayer(D, F, H).to(device).to(dtype)
x = torch.randn(1, 1, D).to(device).to(dtype)
cache = [
torch.randn(1, H, C, D // H).to(device).to(dtype),
torch.randn(1, H, C, D // H).to(device).to(dtype),
]
T = measure(layer, x, cache)
print("Time per layer per token:", T, "ms")
print("Lower bound total time per token:", T * 32, "ms")

View File

@@ -44,6 +44,13 @@ def time_matmul():
time_fn(mx.matmul, a, b)
def time_maximum():
a = mx.random.uniform(shape=(32, 1024, 1024))
b = mx.random.uniform(shape=(32, 1024, 1024))
mx.eval(a, b)
time_fn(mx.maximum, a, b)
def time_negative():
a = mx.random.uniform(shape=(10000, 1000))
mx.eval(a)
@@ -101,6 +108,7 @@ if __name__ == "__main__":
time_add()
time_matmul()
time_maximum()
time_exp()
time_negative()
time_logsumexp()

View File

@@ -5,13 +5,15 @@
import os
import subprocess
import mlx.core as mx
# -- Project information -----------------------------------------------------
project = "MLX"
copyright = "2023, MLX Contributors"
author = "MLX Contributors"
version = "0.0.7"
release = "0.0.7"
version = ".".join(mx.__version__.split(".")[:3])
release = version
# -- General configuration ---------------------------------------------------

View File

@@ -929,7 +929,7 @@ We see some modest improvements right away!
This operation is now good to be used to build other operations,
in :class:`mlx.nn.Module` calls, and also as a part of graph
transformations such as :meth:`grad` and :meth:`simplify`!
transformations like :meth:`grad`!
Scripts
-------

View File

@@ -371,7 +371,7 @@ Scripts
The full example code is available in `mlx-examples`_.
.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llama
.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llms/llama
.. [1] Su, J., Lu, Y., Pan, S., Murtadha, A., Wen, B. and Liu, Y., 2021.
Roformer: Enhanced transformer with rotary position embedding. arXiv

View File

@@ -35,9 +35,14 @@ are the CPU and GPU.
:caption: Usage
:maxdepth: 1
quick_start
unified_memory
using_streams
usage/quick_start
usage/lazy_evaluation
usage/unified_memory
usage/indexing
usage/saving_and_loading
usage/function_transforms
usage/numpy
usage/using_streams
.. toctree::
:caption: Examples

View File

@@ -1,8 +1,8 @@
Build and Install
=================
Install from PyPI
-----------------
Python Installation
-------------------
MLX is available on PyPI. All you have to do to use MLX with your own Apple
silicon computer is
@@ -21,6 +21,14 @@ To install from PyPI you must meet the following requirements:
MLX is only available on devices running macOS >= 13.3
It is highly recommended to use macOS 14 (Sonoma)
MLX is also available on conda-forge. To install MLX with conda do:
.. code-block:: shell
conda install conda-forge::mlx
Troubleshooting
^^^^^^^^^^^^^^^
@@ -48,6 +56,9 @@ Build Requirements
- `cmake <https://cmake.org/>`_ -- version 3.24 or later, and ``make``
- Xcode >= 14.3 (Xcode >= 15.0 for macOS 14 and above)
.. note::
Ensure your shell environment is native ``arm``, not ``x86`` via Rosetta. If
the output of ``uname -p`` is ``x86``, see the :ref:`troubleshooting section <build shell>` below.
Python API
^^^^^^^^^^
@@ -169,6 +180,7 @@ should point to the path to the built metal library.
Troubleshooting
^^^^^^^^^^^^^^^
Metal not found
~~~~~~~~~~~~~~~
@@ -189,3 +201,34 @@ Then set the active developer directory:
.. code-block:: shell
sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
x86 Shell
~~~~~~~~~
.. _build shell:
If the output of ``uname -p`` is ``x86``, then your shell is running as x86 via
Rosetta instead of natively.
To fix this, find the application in Finder (``/Applications`` for iTerm,
``/Applications/Utilities`` for Terminal), right-click, and click “Get Info”.
Uncheck “Open using Rosetta”, close the “Get Info” window, and restart your
terminal.
Verify the terminal is now running natively with the following command:
.. code-block:: shell
$ uname -p
arm
Also check that cmake is using the correct architecture:
.. code-block:: shell
$ cmake --system-information | grep CMAKE_HOST_SYSTEM_PROCESSOR
CMAKE_HOST_SYSTEM_PROCESSOR "arm64"
If you see ``"x86_64"``, try re-installing ``cmake``. If you see ``"arm64"``
but the build errors out with "Building for x86_64 on macOS is not supported.",
wipe your build cache with ``rm -rf build/`` and try again.

View File

@@ -29,9 +29,9 @@ The default floating point type is ``float32`` and the default integer type is
* - ``uint32``
- 4
- 32-bit unsigned integer
* - ``uint32``
* - ``uint64``
- 8
- 32-bit unsigned integer
- 64-bit unsigned integer
* - ``int8``
- 1
- 8-bit signed integer

View File

@@ -9,3 +9,4 @@ Linear Algebra
:toctree: _autosummary
norm
qr

View File

@@ -180,3 +180,4 @@ In detail:
nn/layers
nn/functions
nn/losses
nn/init

View File

@@ -15,9 +15,10 @@ simple functions.
gelu
gelu_approx
gelu_fast_approx
relu
mish
prelu
relu
selu
softshrink
silu
step
selu
mish

View File

@@ -0,0 +1,45 @@
.. _init:
.. currentmodule:: mlx.nn.init
Initializers
------------
The ``mlx.nn.init`` package contains commonly used initializers for neural
network parameters. Initializers return a function which can be applied to any
input :obj:`mlx.core.array` to produce an initialized output.
For example:
.. code:: python
import mlx.core as mx
import mlx.nn as nn
init_fn = nn.init.uniform()
# Produces a [2, 2] uniform matrix
param = init_fn(mx.zeros((2, 2)))
To re-initialize all the parameters in an :obj:`mlx.nn.Module` from, say, a uniform
distribution, you can do:
.. code:: python
import mlx.nn as nn
model = nn.Sequential(nn.Linear(5, 10), nn.ReLU(), nn.Linear(10, 5))
init_fn = nn.init.uniform(low=-0.1, high=0.1)
model.apply(init_fn)
.. autosummary::
:toctree: _autosummary
constant
normal
uniform
identity
glorot_normal
glorot_uniform
he_normal
he_uniform

View File

@@ -9,29 +9,30 @@ Layers
:toctree: _autosummary
:template: nn-module-template.rst
Sequential
ReLU
PReLU
GELU
SiLU
Step
SELU
Mish
Embedding
Linear
QuantizedLinear
ALiBi
BatchNorm
Conv1d
Conv2d
BatchNorm
LayerNorm
RMSNorm
GroupNorm
InstanceNorm
Dropout
Dropout2d
Dropout3d
Transformer
Embedding
GELU
GroupNorm
InstanceNorm
LayerNorm
Linear
Mish
MultiHeadAttention
ALiBi
PReLU
QuantizedLinear
RMSNorm
ReLU
RoPE
SELU
Sequential
SiLU
SinusoidalPositionalEncoding
Softshrink
Step
Transformer

View File

@@ -10,13 +10,15 @@ Loss Functions
:template: nn-module-template.rst
binary_cross_entropy
cosine_similarity_loss
cross_entropy
gaussian_nll_loss
hinge_loss
huber_loss
kl_div_loss
l1_loss
log_cosh_loss
mse_loss
nll_loss
smooth_l1_loss
triplet_loss
hinge_loss
huber_loss
log_cosh_loss
triplet_loss

View File

@@ -35,7 +35,10 @@ Operations
cos
cosh
dequantize
diag
diagonal
divide
divmod
equal
erf
erfinv
@@ -49,6 +52,11 @@ Operations
greater
greater_equal
identity
inner
isnan
isposinf
isneginf
isinf
less
less_equal
linspace
@@ -59,6 +67,8 @@ Operations
log1p
logaddexp
logical_not
logical_and
logical_or
logsumexp
matmul
max
@@ -71,6 +81,7 @@ Operations
negative
ones
ones_like
outer
partition
pad
prod
@@ -84,6 +95,7 @@ Operations
save
savez
savez_compressed
save_gguf
save_safetensors
sigmoid
sign

View File

@@ -40,6 +40,7 @@ model's parameters and the **optimizer state**.
SGD
RMSprop
Adagrad
Adafactor
AdaDelta
Adam
AdamW

View File

@@ -33,13 +33,13 @@ we use a splittable version of Threefry, which is a counter-based PRNG.
.. autosummary::
:toctree: _autosummary
seed
key
split
bernoulli
categorical
gumbel
key
normal
randint
uniform
seed
split
truncated_normal
uniform

View File

@@ -14,4 +14,3 @@ Transforms
jvp
vjp
vmap
simplify

View File

@@ -0,0 +1,188 @@
.. _function_transforms:
Function Transforms
===================
.. currentmodule:: mlx.core
MLX uses composable function transformations for automatic differentiation and
vectorization. The key idea behind composable function transformations is that
every transformation returns a function which can be further transformed.
Here is a simple example:
.. code-block:: shell
>>> dfdx = mx.grad(mx.sin)
>>> dfdx(mx.array(mx.pi))
array(-1, dtype=float32)
>>> mx.cos(mx.array(mx.pi))
array(-1, dtype=float32)
The output of :func:`grad` on :func:`sin` is simply another function. In this
case it is the gradient of the sine function which is exactly the cosine
function. To get the second derivative you can do:
.. code-block:: shell
>>> d2fdx2 = mx.grad(mx.grad(mx.sin))
>>> d2fdx2(mx.array(mx.pi / 2))
array(-1, dtype=float32)
>>> mx.sin(mx.array(mx.pi / 2))
array(1, dtype=float32)
Using :func:`grad` on the output of :func:`grad` is always fine; you keep
getting higher-order derivatives.
Any of the MLX function transformations can be composed in any order to any
depth. To see the complete list of function transformations check out the
:ref:`API documentation <transforms>`. See the following sections for more
information on :ref:`automatic differentiation <auto diff>` and
:ref:`automatic vectorization <vmap>`.
Automatic Differentiation
-------------------------
.. _auto diff:
Automatic differentiation in MLX works on functions rather than on implicit
graphs.
.. note::
If you are coming to MLX from PyTorch, you no longer need functions like
``backward``, ``zero_grad``, and ``detach``, or properties like
``requires_grad``.
The most basic example is taking the gradient of a scalar-valued function as we
saw above. You can use the :func:`grad` and :func:`value_and_grad` functions to
compute gradients of more complex functions. By default these functions compute
the gradient with respect to the first argument:
.. code-block:: python
def loss_fn(w, x, y):
return mx.mean(mx.square(w * x - y))
w = mx.array(1.0)
x = mx.array([0.5, -0.5])
y = mx.array([1.5, -1.5])
# Computes the gradient of loss_fn with respect to w:
grad_fn = mx.grad(loss_fn)
dloss_dw = grad_fn(w, x, y)
# Prints array(-1, dtype=float32)
print(dloss_dw)
# To get the gradient with respect to x we can do:
grad_fn = mx.grad(loss_fn, argnums=1)
dloss_dx = grad_fn(w, x, y)
# Prints array([-1, 1], dtype=float32)
print(dloss_dx)
One way to get the loss and gradient is to call ``loss_fn`` followed by
``grad_fn``, but this can result in a lot of redundant work. Instead, you
should use :func:`value_and_grad`. Continuing the above example:
.. code-block:: python
# Computes the gradient of loss_fn with respect to w:
loss_and_grad_fn = mx.value_and_grad(loss_fn)
loss, dloss_dw = loss_and_grad_fn(w, x, y)
# Prints array(1, dtype=float32)
print(loss)
# Prints array(-1, dtype=float32)
print(dloss_dw)
You can also take the gradient with respect to arbitrarily nested Python
containers of arrays (specifically any of :obj:`list`, :obj:`tuple`, or
:obj:`dict`).
Suppose we wanted a weight and a bias parameter in the above example. A nice
way to do that is the following:
.. code-block:: python
def loss_fn(params, x, y):
w, b = params["weight"], params["bias"]
h = w * x + b
return mx.mean(mx.square(h - y))
params = {"weight": mx.array(1.0), "bias": mx.array(0.0)}
x = mx.array([0.5, -0.5])
y = mx.array([1.5, -1.5])
# Computes the gradient of loss_fn with respect to both the
# weight and bias:
grad_fn = mx.grad(loss_fn)
grads = grad_fn(params, x, y)
# Prints
# {'weight': array(-1, dtype=float32), 'bias': array(0, dtype=float32)}
print(grads)
Notice the tree structure of the parameters is preserved in the gradients.
In some cases you may want to stop gradients from propagating through a
part of the function. You can use :func:`stop_gradient` for that.
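Here is a small illustrative sketch (separate from the examples above) that blocks gradients through one branch of a computation:
.. code-block:: python
import mlx.core as mx
def fun(x):
    a = mx.square(x)
    # Gradients do not flow through this branch
    b = mx.stop_gradient(mx.square(x))
    return (a + b).sum()
x = mx.array([1.0, 2.0])
# Only the first branch contributes: prints array([2, 4], dtype=float32)
print(mx.grad(fun)(x))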
Automatic Vectorization
-----------------------
.. _vmap:
Use :func:`vmap` to automate vectorizing complex functions. Here we'll go
through a basic and contrived example for the sake of clarity, but :func:`vmap`
can be quite powerful for more complex functions which are difficult to optimize
by hand.
.. warning::
Some operations are not yet supported with :func:`vmap`. If you encounter an error
like: ``ValueError: Primitive's vmap not implemented.`` file an `issue
<https://github.com/ml-explore/mlx/issues>`_ and include your function.
We will prioritize including it.
A naive way to add the elements from two sets of vectors is with a loop:
.. code-block:: python
xs = mx.random.uniform(shape=(4096, 100))
ys = mx.random.uniform(shape=(100, 4096))
def naive_add(xs, ys):
return [xs[:, i] + ys[i] for i in range(xs.shape[1])]
Instead you can use :func:`vmap` to automatically vectorize the addition:
.. code-block:: python
# Vectorize over the second dimension of x and the
# first dimension of y
vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(1, 0))
The ``in_axes`` parameter can be used to specify which dimensions of the
corresponding input to vectorize over. Similarly, use ``out_axes`` to specify
where the vectorized axes should be in the outputs.
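For instance, a quick sketch using the ``xs`` and ``ys`` defined above:
.. code-block:: python
# Vectorized axis first in the output, shape (100, 4096)
add_rows = mx.vmap(lambda x, y: x + y, in_axes=(1, 0), out_axes=0)
# Vectorized axis last in the output, shape (4096, 100)
add_cols = mx.vmap(lambda x, y: x + y, in_axes=(1, 0), out_axes=1)
print(add_rows(xs, ys).shape)
print(add_cols(xs, ys).shape)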
Let's time these two different versions:
.. code-block:: python
import timeit
print(timeit.timeit(lambda: mx.eval(naive_add(xs, ys)), number=100))
print(timeit.timeit(lambda: mx.eval(vmap_add(xs, ys)), number=100))
On an M1 Max the naive version takes in total ``0.390`` seconds whereas the
vectorized version takes only ``0.025`` seconds, more than ten times faster.
Of course, this operation is quite contrived. A better approach is to simply do
``xs + ys.T``, but for more complex functions :func:`vmap` can be quite handy.

docs/src/usage/indexing.rst Normal file
View File

@@ -0,0 +1,123 @@
.. _indexing:
Indexing Arrays
===============
.. currentmodule:: mlx.core
For the most part, indexing an MLX :obj:`array` works the same as indexing a
NumPy :obj:`numpy.ndarray`. See the `NumPy documentation
<https://numpy.org/doc/stable/user/basics.indexing.html>`_ for more details on
how that works.
For example, you can use regular integers and slices (:obj:`slice`) to index arrays:
.. code-block:: shell
>>> arr = mx.arange(10)
>>> arr[3]
array(3, dtype=int32)
>>> arr[-2] # negative indexing works
array(8, dtype=int32)
>>> arr[2:8:2] # start, stop, stride
array([2, 4, 6], dtype=int32)
For multi-dimensional arrays, the ``...`` or :obj:`Ellipsis` syntax works as in NumPy:
.. code-block:: shell
>>> arr = mx.arange(8).reshape(2, 2, 2)
>>> arr[:, :, 0]
array([[0, 2],
       [4, 6]], dtype=int32)
>>> arr[..., 0]
array([[0, 2],
       [4, 6]], dtype=int32)
You can index with ``None`` to create a new axis:
.. code-block:: shell
>>> arr = mx.arange(8)
>>> arr.shape
[8]
>>> arr[None].shape
[1, 8]
You can also use an :obj:`array` to index another :obj:`array`:
.. code-block:: shell
>>> arr = mx.arange(10)
>>> idx = mx.array([5, 7])
>>> arr[idx]
array([5, 7], dtype=int32)
Mixing and matching integers, :obj:`slice`, ``...``, and :obj:`array` indices
works just as in NumPy.
Other functions which may be useful for indexing arrays are :func:`take` and
:func:`take_along_axis`.
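For example, a brief sketch with illustrative values:
.. code-block:: python
a = mx.array([[1, 2], [3, 4]])
# Gather rows 1 and 0 along the first axis
print(mx.take(a, mx.array([1, 0]), axis=0))
# array([[3, 4],
#        [1, 2]], dtype=int32)
# Pick one element per row with an index array of the same rank
idx = mx.array([[0], [1]])
print(mx.take_along_axis(a, idx, axis=1))
# array([[1],
#        [4]], dtype=int32)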
Differences from NumPy
----------------------
.. Note::
MLX indexing is different from NumPy indexing in two important ways:
* Indexing does not perform bounds checking. Indexing out of bounds is
undefined behavior.
* Boolean mask based indexing is not yet supported.
The reason for the lack of bounds checking is that exceptions cannot propagate
from the GPU. Performing bounds checking for array indices before launching the
kernel would be extremely inefficient.
Indexing with boolean masks is something that MLX may support in the future. In
general, MLX has limited support for operations whose output
*shapes* depend on input *data*. Other examples of these types of
operations which MLX does not yet support include :func:`numpy.nonzero` and the
single input version of :func:`numpy.where`.
In Place Updates
----------------
In place updates to indexed arrays are possible in MLX. For example:
.. code-block:: shell
>>> a = mx.array([1, 2, 3])
>>> a[2] = 0
>>> a
array([1, 2, 0], dtype=int32)
Just as in NumPy, in place updates will be reflected in all references to the
same array:
.. code-block:: shell
>>> a = mx.array([1, 2, 3])
>>> b = a
>>> b[2] = 0
>>> b
array([1, 2, 0], dtype=int32)
>>> a
array([1, 2, 0], dtype=int32)
Transformations of functions which use in-place updates are allowed and work as
expected. For example:
.. code-block:: python
def fun(x, idx):
x[idx] = 2.0
return x.sum()
dfdx = mx.grad(fun)(mx.array([1.0, 2.0, 3.0]), mx.array([1]))
print(dfdx) # Prints: array([1, 0, 1], dtype=float32)
In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx``
and ones elsewhere.

View File

@@ -0,0 +1,144 @@
.. _lazy eval:
Lazy Evaluation
===============
.. currentmodule:: mlx.core
Why Lazy Evaluation
-------------------
When you perform operations in MLX, no computation actually happens. Instead a
compute graph is recorded. The actual computation only happens if an
:func:`eval` is performed.
MLX uses lazy evaluation because it has some nice features, some of which we
describe below.
Transforming Compute Graphs
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Lazy evaluation lets us record a compute graph without actually doing any
computations. This is useful for function transformations like :func:`grad` and
:func:`vmap` and graph optimizations.
Currently, MLX does not compile and rerun compute graphs. They are all
generated dynamically. However, lazy evaluation makes it much easier to
integrate compilation for future performance enhancements.
Only Compute What You Use
^^^^^^^^^^^^^^^^^^^^^^^^^
In MLX you do not need to worry as much about computing outputs that are never
used. For example:
.. code-block:: python
def fun(x):
a = fun1(x)
b = expensive_fun(a)
return a, b
y, _ = fun(x)
Here, we never actually compute the output of ``expensive_fun``. Use this
pattern with care though, as the graph of ``expensive_fun`` is still built, and
that has some cost associated with it.
Similarly, lazy evaluation can be beneficial for saving memory while keeping
code simple. Say you have a very large model ``Model`` derived from
:obj:`mlx.nn.Module`. You can instantiate this model with ``model = Model()``.
Typically, this will initialize all of the weights as ``float32``, but the
initialization does not actually compute anything until you perform an
:func:`eval`. If you update the model with ``float16`` weights, your maximum
consumed memory will be half that required if eager computation was used
instead.
This pattern is simple to do in MLX thanks to lazy computation:
.. code-block:: python
model = Model() # no memory used yet
model.load_weights("weights_fp16.safetensors")
When to Evaluate
----------------
A common question is when to use :func:`eval`. The trade-off is between
letting graphs get too large and not batching enough useful work.
For example:
.. code-block:: python
for _ in range(100):
a = a + b
mx.eval(a)
b = b * 2
mx.eval(b)
This is a bad idea because there is some fixed overhead with each graph
evaluation. On the other hand, there is some slight overhead which grows with
the compute graph size, so extremely large graphs (while computationally
correct) can be costly.
Luckily, a wide range of compute graph sizes work pretty well with MLX:
anything from a few tens of operations to many thousands of operations per
evaluation should be okay.
Most numerical computations have an iterative outer loop (e.g. the iteration in
stochastic gradient descent). A natural and usually efficient place to use
:func:`eval` is at each iteration of this outer loop.
Here is a concrete example:
.. code-block:: python
for batch in dataset:
# Nothing has been evaluated yet
loss, grad = value_and_grad_fn(model, batch)
# Still nothing has been evaluated
optimizer.update(model, grad)
# Evaluate the loss and the new parameters which will
# run the full gradient computation and optimizer update
mx.eval(loss, model.parameters())
An important behavior to be aware of is when the graph will be implicitly
evaluated. Anytime you ``print`` an array, convert it to a
:obj:`numpy.ndarray`, or otherwise access its memory via :obj:`memoryview`,
the graph will be evaluated. Saving arrays via :func:`save` (or any other MLX
saving functions) will also evaluate the array.
Calling :func:`array.item` on a scalar array will also evaluate it. In the
example above, printing the loss (``print(loss)``) or adding the loss scalar to
a list (``losses.append(loss.item())``) would cause a graph evaluation. If
these lines are before ``mx.eval(loss, model.parameters())`` then this
will be a partial evaluation, computing only the forward pass.
Also, calling :func:`eval` on an array or set of arrays multiple times is
perfectly fine. This is effectively a no-op.
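For instance, a small illustrative sketch of implicit evaluation:
.. code-block:: python
import mlx.core as mx
import numpy as np
a = mx.array([1.0, 2.0]) + 1  # recorded, not yet computed
print(a)              # printing evaluates the graph
b = np.array(a)       # so does converting to NumPy
s = a.sum().item()    # and so does calling item()
mx.eval(a)            # already evaluated, effectively a no-op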
.. warning::
Using scalar arrays for control-flow will cause an evaluation.
Here is an example:
.. code-block:: python
def fun(x):
h, y = first_layer(x)
if y > 0: # An evaluation is done here!
z = second_layer_a(h)
else:
z = second_layer_b(h)
return z
Using arrays for control flow should be done with care. The above example works
and can even be used with gradient transformations. However, this can be very
inefficient if evaluations are done too frequently.

docs/src/usage/numpy.rst Normal file
View File

@@ -0,0 +1,108 @@
.. _numpy:
Conversion to NumPy and Other Frameworks
========================================
MLX arrays implement the `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_.
Let's convert an array to NumPy and back.
.. code-block:: python
import mlx.core as mx
import numpy as np
a = mx.arange(3)
b = np.array(a) # copy of a
c = mx.array(b) # copy of b
.. note::
Since NumPy does not support ``bfloat16`` arrays, you will need to convert to ``float16`` or ``float32`` first:
``np.array(a.astype(mx.float32))``.
Otherwise, you will receive an error like: ``Item size 2 for PEP 3118 buffer format string does not match the dtype V item size 0.``
By default, NumPy copies data to a new array. This can be prevented by creating an array view:
.. code-block:: python
a = mx.arange(3)
a_view = np.array(a, copy=False)
print(a_view.flags.owndata) # False
a_view[0] = 1
print(a[0].item()) # 1
A NumPy array view is a normal NumPy array, except that it does not own its memory.
This means writing to the view is reflected in the original array.
While this is quite powerful for avoiding copies, note that external changes to an array's memory will not be reflected in gradients.
Let's demonstrate this in an example:
.. code-block:: python
def f(x):
x_view = np.array(x, copy=False)
x_view[:] *= x_view # modify memory without telling mx
return x.sum()
x = mx.array([3.0])
y, df = mx.value_and_grad(f)(x)
print("f(x) = x² =", y.item()) # 9.0
print("f'(x) = 2x !=", df.item()) # 1.0
The function ``f`` indirectly modifies the array ``x`` through a memory view.
However, this modification is not reflected in the gradient, as seen in the last line outputting ``1.0``,
representing the gradient of the sum operation alone.
The squaring of ``x`` occurs externally to MLX, meaning that no gradient is incorporated.
It's important to note that a similar issue arises during array conversion and copying.
For instance, a function defined as ``mx.array(np.array(x)**2).sum()`` would also result in an incorrect gradient,
even though no in-place operations on MLX memory are executed.
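In contrast, keeping the computation entirely in MLX gives the expected gradient. A minimal sketch:
.. code-block:: python
import mlx.core as mx
def g(x):
    return mx.square(x).sum()  # stays in MLX, so the gradient is tracked
x = mx.array([3.0])
y, dg = mx.value_and_grad(g)(x)
print(y.item())   # 9.0
print(dg.item())  # 6.0, i.e. 2x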
PyTorch
-------
.. warning::
PyTorch support for :obj:`memoryview` is experimental and can break for
multi-dimensional arrays. Casting to NumPy first is advised for now.
PyTorch supports the buffer protocol, but it requires an explicit :obj:`memoryview`.
.. code-block:: python
import mlx.core as mx
import torch
a = mx.arange(3)
b = torch.tensor(memoryview(a))
c = mx.array(b.numpy())
Conversion from PyTorch tensors back to arrays must be done via intermediate NumPy arrays with ``numpy()``.
JAX
---
JAX fully supports the buffer protocol.
.. code-block:: python
import mlx.core as mx
import jax.numpy as jnp
a = mx.arange(3)
b = jnp.array(a)
c = mx.array(b)
TensorFlow
----------
TensorFlow supports the buffer protocol, but it requires an explicit :obj:`memoryview`.
.. code-block:: python
import mlx.core as mx
import tensorflow as tf
a = mx.arange(3)
b = tf.constant(memoryview(a))
c = mx.array(b)

View File

@@ -40,6 +40,9 @@ automatically evaluate the array.
>> np.array(c) # Also evaluates c
array([2., 4., 6., 8.], dtype=float32)
See the page on :ref:`Lazy Evaluation <lazy eval>` for more details.
Function and Graph Transformations
----------------------------------

View File

@@ -0,0 +1,81 @@
.. _saving_and_loading:
Saving and Loading Arrays
=========================
.. currentmodule:: mlx.core
MLX supports multiple array serialization formats.
.. list-table:: Serialization Formats
:widths: 20 8 25 25
:header-rows: 1
* - Format
- Extension
- Function
- Notes
* - NumPy
- ``.npy``
- :func:`save`
- Single arrays only
* - NumPy archive
- ``.npz``
- :func:`savez` and :func:`savez_compressed`
- Multiple arrays
* - Safetensors
- ``.safetensors``
- :func:`save_safetensors`
- Multiple arrays
* - GGUF
- ``.gguf``
- :func:`save_gguf`
- Multiple arrays
The :func:`load` function will load any of the supported serialization
formats. It determines the format from the file extension. The output of
:func:`load` depends on the format.
Here's an example of saving a single array to a file:
.. code-block:: shell
>>> a = mx.array([1.0])
>>> mx.save("array", a)
The array ``a`` will be saved in the file ``array.npy``. Including the
extension when saving is optional; if it is missing it will be added
automatically. You can load the array with:
.. code-block:: shell
>>> mx.load("array.npy", a)
array([1], dtype=float32)
Here's an example of saving several arrays to a single file:
.. code-block:: shell
>>> a = mx.array([1.0])
>>> b = mx.array([2.0])
>>> mx.savez("arrays", a, b=b)
For compatibility with :func:`numpy.savez` the MLX :func:`savez` takes arrays
as arguments. If the keywords are missing, then default names will be
provided. This can be loaded with:
.. code-block:: shell
>>> mx.load("arrays.npz")
{'b': array([2], dtype=float32), 'arr_0': array([1], dtype=float32)}
In this case :func:`load` returns a dictionary of names to arrays.
The functions :func:`save_safetensors` and :func:`save_gguf` are similar to
:func:`savez`, but they take as input a :obj:`dict` of string names to arrays:
.. code-block:: shell
>>> a = mx.array([1.0])
>>> b = mx.array([2.0])
>>> mx.save_safetensors("arrays", {"a": a, "b": b})
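As with ``.npz`` files, the saved arrays can be read back with :func:`load`; the output below is a sketch of the expected dictionary:
.. code-block:: shell
>>> mx.load("arrays.safetensors")
{'a': array([1], dtype=float32), 'b': array([2], dtype=float32)}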

View File

@@ -104,7 +104,10 @@ void axpby_impl(
}
/** Fall back implementation for evaluation on CPU */
void Axpby::eval(const std::vector<array>& inputs, array& out) {
void Axpby::eval(
const std::vector<array>& inputs,
std::vector<array>& out_arr) {
auto out = out_arr[0];
// Check the inputs (registered in the op while constructing the out array)
assert(inputs.size() == 2);
auto& x = inputs[0];
@@ -175,7 +178,10 @@ void axpby_impl_accelerate(
}
/** Evaluate primitive on CPU using accelerate specializations */
void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
void Axpby::eval_cpu(
const std::vector<array>& inputs,
std::vector<array>& outarr) {
auto out = outarr[0];
assert(inputs.size() == 2);
auto& x = inputs[0];
auto& y = inputs[1];
@@ -189,13 +195,15 @@ void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
}
// Fall back to common backend if specializations are not available
eval(inputs, out);
eval(inputs, outarr);
}
#else // Accelerate not available
/** Evaluate primitive on CPU falling back to common backend */
void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
void Axpby::eval_cpu(
const std::vector<array>& inputs,
std::vector<array>& out) {
eval(inputs, out);
}
@@ -208,8 +216,11 @@ void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
#ifdef _METAL_
/** Evaluate primitive on GPU */
void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
void Axpby::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outarr) {
// Prepare inputs
auto out = outarr[0];
assert(inputs.size() == 2);
auto& x = inputs[0];
auto& y = inputs[1];
@@ -295,7 +306,9 @@ void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
#else // Metal is not available
/** Fail evaluation on GPU */
void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
void Axpby::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& out) {
throw std::runtime_error("Axpby has no GPU implementation.");
}
@@ -306,7 +319,7 @@ void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
///////////////////////////////////////////////////////////////////////////////
/** The Jacobian-vector product. */
array Axpby::jvp(
std::vector<array> Axpby::jvp(
const std::vector<array>& primals,
const std::vector<array>& tangents,
const std::vector<int>& argnums) {
@@ -321,32 +334,33 @@ array Axpby::jvp(
if (argnums.size() > 1) {
auto scale = argnums[0] == 0 ? alpha_ : beta_;
auto scale_arr = array(scale, tangents[0].dtype());
return multiply(scale_arr, tangents[0], stream());
return {multiply(scale_arr, tangents[0], stream())};
}
// If, argnums = {0, 1}, we take contributions from both
// which gives us jvp = tangent_x * alpha + tangent_y * beta
else {
return axpby(tangents[0], tangents[1], alpha_, beta_, stream());
return {axpby(tangents[0], tangents[1], alpha_, beta_, stream())};
}
}
/** The vector-Jacobian product. */
std::vector<array> Axpby::vjp(
const std::vector<array>& primals,
const array& cotan,
const std::vector<int>& argnums) {
const std::vector<array>& cotangents,
const std::vector<int>& argnums,
const std::vector<array>&) {
// Reverse mode diff
std::vector<array> vjps;
for (auto arg : argnums) {
auto scale = arg == 0 ? alpha_ : beta_;
auto scale_arr = array(scale, cotan.dtype());
vjps.push_back(multiply(scale_arr, cotan, stream()));
auto scale_arr = array(scale, cotangents[0].dtype());
vjps.push_back(multiply(scale_arr, cotangents[0], stream()));
}
return vjps;
}
/** Vectorize primitive along given axis */
std::pair<array, int> Axpby::vmap(
std::pair<std::vector<array>, std::vector<int>> Axpby::vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) {
throw std::runtime_error("Axpby has no vmap implementation.");

View File

@@ -42,11 +42,13 @@ class Axpby : public Primitive {
* To avoid unnecessary allocations, the evaluation function
* is responsible for allocating space for the array.
*/
void eval_cpu(const std::vector<array>& inputs, array& out) override;
void eval_gpu(const std::vector<array>& inputs, array& out) override;
void eval_cpu(const std::vector<array>& inputs, std::vector<array>& out)
override;
void eval_gpu(const std::vector<array>& inputs, std::vector<array>& out)
override;
/** The Jacobian-vector product. */
array jvp(
std::vector<array> jvp(
const std::vector<array>& primals,
const std::vector<array>& tangents,
const std::vector<int>& argnums) override;
@@ -54,8 +56,9 @@ class Axpby : public Primitive {
/** The vector-Jacobian product. */
std::vector<array> vjp(
const std::vector<array>& primals,
const array& cotan,
const std::vector<int>& argnums) override;
const std::vector<array>& cotangents,
const std::vector<int>& argnums,
const std::vector<array>& outputs) override;
/**
* The primitive must know how to vectorize itself across
@@ -63,7 +66,7 @@ class Axpby : public Primitive {
* representing the vectorized computation and the axis which
* corresponds to the output vectorized dimension.
*/
std::pair<array, int> vmap(
std::pair<std::vector<array>, std::vector<int>> vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) override;
@@ -80,7 +83,7 @@ class Axpby : public Primitive {
float beta_;
/** Fall back implementation for evaluation on CPU */
void eval(const std::vector<array>& inputs, array& out);
void eval(const std::vector<array>& inputs, std::vector<array>& out);
};
} // namespace mlx::core

View File

@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=42", "pybind11>=2.10", "cmake>=3.24", "mlx @ git+https://github.com/mlx-explore/mlx@main"]
build-backend = "setuptools.build_meta"

View File

@@ -41,6 +41,6 @@ error_norm = mx.sum(mx.square(w - w_star)).item() ** 0.5
throughput = num_iters / (toc - tic)
print(
f"Loss {loss.item():.5f}, |w-w*| = {error_norm:.5f}, "
f"Loss {loss.item():.5f}, L2 distance: |w-w*| = {error_norm:.5f}, "
f"Throughput {throughput:.5f} (it/s)"
)

View File

@@ -5,6 +5,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
${CMAKE_CURRENT_SOURCE_DIR}/graph_utils.cpp
@@ -19,7 +20,7 @@ target_sources(
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
if (MLX_BUILD_ACCELERATE)
if (MLX_BUILD_ACCELERATE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
else()
target_sources(

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <functional>
@@ -6,6 +6,7 @@
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"
namespace mlx::core {
@@ -21,6 +22,12 @@ std::pair<size_t, std::vector<size_t>> cum_prod(const std::vector<int>& shape) {
return {cum_prod, strides};
}
/** Return true if we are currently performing a function transformation in
* order to keep the graph when evaluating tracer arrays. */
bool in_tracing() {
return detail::InTracing::in_tracing();
}
} // namespace
array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
@@ -32,7 +39,7 @@ array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
array::array(
const std::vector<int>& shape,
Dtype dtype,
std::unique_ptr<Primitive> primitive,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs)
: array_desc_(std::make_shared<ArrayDesc>(
shape,
@@ -40,6 +47,34 @@ array::array(
std::move(primitive),
inputs)) {}
array::array(
std::vector<int> shape,
Dtype dtype,
std::shared_ptr<Primitive> primitive,
std::vector<array>&& inputs)
: array_desc_(std::make_shared<ArrayDesc>(
std::move(shape),
dtype,
std::move(primitive),
std::move(inputs))) {}
std::vector<array> array::make_arrays(
const std::vector<std::vector<int>>& shapes,
const std::vector<Dtype>& dtypes,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs) {
std::vector<array> outputs;
for (int i = 0; i < shapes.size(); ++i) {
outputs.push_back(array(shapes[i], dtypes[i], primitive, inputs));
}
for (int i = 0; i < outputs.size(); ++i) {
auto siblings = outputs;
siblings.erase(siblings.begin() + i);
outputs[i].set_siblings(std::move(siblings), i);
}
return outputs;
}
array::array(std::initializer_list<float> data)
: array_desc_(std::make_shared<ArrayDesc>(
std::vector<int>{static_cast<int>(data.size())},
@@ -58,12 +93,26 @@ array::array(
}
void array::detach() {
for (auto& s : array_desc_->siblings) {
s.array_desc_->inputs.clear();
s.array_desc_->siblings.clear();
s.array_desc_->position = 0;
s.array_desc_->depth = 0;
s.array_desc_->primitive = nullptr;
}
array_desc_->inputs.clear();
array_desc_->siblings.clear();
array_desc_->position = 0;
array_desc_->depth = 0;
array_desc_->primitive = nullptr;
}
void array::eval(bool retain_graph /* = false */) {
mlx::core::eval({*this}, retain_graph);
void array::eval() {
mlx::core::eval({*this});
}
bool array::is_tracer() const {
return array_desc_->is_tracer && in_tracing();
}
void array::set_data(allocator::Buffer buffer, deleter_t d) {
@@ -108,6 +157,14 @@ void array::copy_shared_buffer(const array& other) {
copy_shared_buffer(other, other.strides(), other.flags(), other.data_size());
}
void array::move_shared_buffer(array other) {
array_desc_->data = std::move(other.array_desc_->data);
array_desc_->strides = other.strides();
array_desc_->flags = other.flags();
array_desc_->data_size = other.data_size();
array_desc_->data_ptr = other.array_desc_->data_ptr;
}
array::ArrayDesc::ArrayDesc(const std::vector<int>& shape, Dtype dtype)
: shape(shape), dtype(dtype) {
std::tie(size, strides) = cum_prod(shape);
@@ -116,21 +173,43 @@ array::ArrayDesc::ArrayDesc(const std::vector<int>& shape, Dtype dtype)
array::ArrayDesc::ArrayDesc(
const std::vector<int>& shape,
Dtype dtype,
std::unique_ptr<Primitive> primitive,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs)
: shape(shape),
dtype(dtype),
primitive(std::move(primitive)),
inputs(inputs) {
std::tie(size, strides) = cum_prod(shape);
std::tie(size, strides) = cum_prod(this->shape);
for (auto& in : inputs) {
is_tracer |= in.is_tracer();
depth = std::max(in.graph_depth(), depth);
}
depth++;
}
// Needed because the Primitive type used in array.h is incomplete and the
// compiler needs to see the call to the destructor after the type is complete.
array::ArrayDesc::~ArrayDesc() = default;
array::ArrayDesc::ArrayDesc(
std::vector<int>&& shape,
Dtype dtype,
std::shared_ptr<Primitive> primitive,
std::vector<array>&& inputs)
: shape(std::move(shape)),
dtype(dtype),
primitive(std::move(primitive)),
inputs(std::move(inputs)) {
std::tie(size, strides) = cum_prod(this->shape);
for (auto& in : inputs) {
is_tracer |= in.is_tracer();
depth = std::max(in.graph_depth(), depth);
}
depth++;
}
array::ArrayIterator::ArrayIterator(const array& arr, int idx)
: arr(arr), idx(idx) {
if (arr.ndim() == 0) {
throw std::invalid_argument("Cannot iterate over 0-d array.");
}
}
array::ArrayIterator::reference array::ArrayIterator::operator*() const {
auto start = std::vector<int>(arr.ndim(), 0);

View File

@@ -1,5 +1,4 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include <algorithm>
#include <cstdint>
@@ -116,11 +115,11 @@ class array {
};
/** Evaluate the array. */
void eval(bool retain_graph = false);
void eval();
/** Get the value from a scalar array. */
template <typename T>
T item(bool retain_graph = false);
T item();
struct ArrayIterator {
using iterator_category = std::random_access_iterator_tag;
@@ -128,11 +127,7 @@ class array {
using value_type = const array;
using reference = value_type;
explicit ArrayIterator(const array& arr, int idx = 0) : arr(arr), idx(idx) {
if (arr.ndim() == 0) {
throw std::invalid_argument("Cannot iterate over 0-d array.");
}
}
explicit ArrayIterator(const array& arr, int idx = 0);
reference operator*() const;
@@ -174,7 +169,19 @@ class array {
array(
const std::vector<int>& shape,
Dtype dtype,
std::unique_ptr<Primitive> primitive,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs);
array(
std::vector<int> shape,
Dtype dtype,
std::shared_ptr<Primitive> primitive,
std::vector<array>&& inputs);
static std::vector<array> make_arrays(
const std::vector<std::vector<int>>& shapes,
const std::vector<Dtype>& dtypes,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs);
/** A unique identifier for an array. */
@@ -182,6 +189,11 @@ class array {
return reinterpret_cast<std::uintptr_t>(array_desc_.get());
}
/** A unique identifier for an arrays primitive. */
std::uintptr_t primitive_id() const {
return reinterpret_cast<std::uintptr_t>(array_desc_->primitive.get());
}
struct Data {
allocator::Buffer buffer;
deleter_t d;
@@ -209,6 +221,11 @@ class array {
return *(array_desc_->primitive);
};
/** A shared pointer to the array's primitive. */
std::shared_ptr<Primitive>& primitive_ptr() const {
return array_desc_->primitive;
};
/** Check if the array has an attached primitive or is a leaf node. */
bool has_primitive() const {
return array_desc_->primitive != nullptr;
@@ -219,12 +236,42 @@ class array {
return array_desc_->inputs;
};
/** A non-const reference to the array's inputs so that they can be used to
* edit the graph. */
std::vector<array>& editable_inputs() {
std::vector<array>& inputs() {
return array_desc_->inputs;
}
/** True indicates the arrays buffer is safe to reuse */
bool is_donatable() const {
return array_desc_.use_count() == 1 && (array_desc_->data.use_count() == 1);
}
/** The array's siblings. */
const std::vector<array>& siblings() const {
return array_desc_->siblings;
};
void set_siblings(std::vector<array> siblings, uint16_t position) {
array_desc_->siblings = std::move(siblings);
array_desc_->position = position;
}
/** The outputs of the array's primitive (i.e. this array and
* its siblings) in the order the primitive expects. */
std::vector<array> outputs() const {
auto idx = array_desc_->position;
std::vector<array> outputs;
outputs.reserve(siblings().size() + 1);
outputs.insert(outputs.end(), siblings().begin(), siblings().begin() + idx);
outputs.push_back(*this);
outputs.insert(outputs.end(), siblings().begin() + idx, siblings().end());
return outputs;
};
/** The depth of the array in the graph. Evaluated arrays have depth 0. */
uint16_t graph_depth() const {
return array_desc_->depth;
}
/** Detach the array from the graph. */
void detach();
@@ -245,6 +292,12 @@ class array {
return array_desc_->data->buffer;
};
// Return a copy of the shared pointer
// to the array::Data struct
std::shared_ptr<Data> data_shared_ptr() const {
return array_desc_->data;
}
// Return a raw pointer to the arrays data
template <typename T>
T* data() {
return static_cast<T*>(array_desc_->data_ptr);
@@ -265,9 +318,7 @@ class array {
array_desc_->is_tracer = is_tracer;
}
// Check if the array is a tracer array
bool is_tracer() const {
return array_desc_->is_tracer;
}
bool is_tracer() const;
void set_data(allocator::Buffer buffer, deleter_t d = allocator::free);
@@ -287,6 +338,8 @@ class array {
void copy_shared_buffer(const array& other);
void move_shared_buffer(array other);
void overwrite_descriptor(const array& other) {
array_desc_ = other.array_desc_;
}
@@ -301,7 +354,7 @@ class array {
std::vector<size_t> strides;
size_t size;
Dtype dtype;
std::unique_ptr<Primitive> primitive{nullptr};
std::shared_ptr<Primitive> primitive{nullptr};
// Indicates an array is being used in a graph transform
// and should not be detached from the graph
@@ -323,22 +376,34 @@ class array {
Flags flags;
std::vector<array> inputs;
// An array to keep track of the siblings from a multi-output
// primitive.
std::vector<array> siblings;
// The arrays position in the output list
uint32_t position{0};
// The depth of the array in the graph.
uint16_t depth{0};
explicit ArrayDesc(const std::vector<int>& shape, Dtype dtype);
explicit ArrayDesc(
const std::vector<int>& shape,
Dtype dtype,
std::unique_ptr<Primitive> primitive,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs);
~ArrayDesc();
explicit ArrayDesc(
std::vector<int>&& shape,
Dtype dtype,
std::shared_ptr<Primitive> primitive,
std::vector<array>&& inputs);
};
// The ArrayDesc contains the details of the materialized array including the
// shape, strides, the data type. It also includes
// the primitive which knows how to compute the array's data from its inputs
// and a the list of array's inputs for the primitive.
// and the list of array's inputs for the primitive.
std::shared_ptr<ArrayDesc> array_desc_{nullptr};
};
@@ -381,11 +446,11 @@ array::array(
}
template <typename T>
T array::item(bool retain_graph /* = false */) {
T array::item() {
if (size() != 1) {
throw std::invalid_argument("item can only be called on arrays of size 1.");
}
eval(retain_graph);
eval();
return *data<T>();
}

View File

@@ -29,12 +29,16 @@ std::tuple<bool, size_t, array> check_transpose(const array& arr) {
}
}
inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
inline void matmul_cblas_general(
const array& a_pre,
const array& b_pre,
array& out,
float alpha = 1.0f,
float beta = 0.0f) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[matmul_cblas] on CPU currently only supports float32");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
@@ -42,6 +46,11 @@ inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
for (int i = 0; i < (a.size() / (M * K)); ++i) {
cblas_sgemm(
CblasRowMajor,
@@ -50,21 +59,34 @@ inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
M,
N,
K,
1.0f, // alpha
alpha, // alpha
a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
lda,
b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
ldb,
0.0f, // beta
beta, // beta
out.data<float>() + M * N * i,
out.shape(-1) // ldc
);
}
}
inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
// TODO: Update to utilize BNNS broadcasting
inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[matmul_cblas] on CPU currently only supports float32");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
return matmul_cblas_general(a_pre, b_pre, out);
}
inline void matmul_bnns_general(
const array& a_pre,
const array& b_pre,
array& out,
float alpha = 1.0f,
float beta = 0.0f) {
// TODO: Update to utilize BNNS broadcasting
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
@@ -72,11 +94,16 @@ inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
BNNSDataType bnns_dtype = to_bnns_dtype(out.dtype());
const BNNSLayerParametersBroadcastMatMul gemm_params{
/* float alpha = */ 1.0,
/* float beta = */ 0.0,
/* float alpha = */ alpha,
/* float beta = */ beta,
/* bool transA = */ a_transposed,
/* bool transB = */ b_transposed,
/* bool quadratic = */ false,
@@ -157,6 +184,12 @@ inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
BNNSFilterDestroy(bnns_filter);
}
inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
// TODO: Update to utilize BNNS broadcasting
out.set_data(allocator::malloc_or_wait(out.nbytes()));
return matmul_bnns_general(a_pre, b_pre, out);
}
} // namespace
void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -166,4 +199,16 @@ void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
return matmul_bnns(inputs[0], inputs[1], out);
}
} // namespace mlx::core
void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
// Fill output with C
auto& c = inputs[2];
CopyType ctype = c.data_size() == 1 ? CopyType::Scalar : CopyType::General;
copy(c, out, ctype);
if (out.dtype() == float32) {
return matmul_cblas_general(inputs[0], inputs[1], out, alpha_, beta_);
}
return matmul_bnns_general(inputs[0], inputs[1], out, alpha_, beta_);
}
} // namespace mlx::core

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <cassert>
#include <cmath>
@@ -17,6 +17,12 @@
primitive::eval(inputs, out); \
}
#define DEFAULT_MULTI(primitive) \
void primitive::eval_cpu( \
const std::vector<array>& inputs, std::vector<array>& outputs) { \
primitive::eval(inputs, outputs); \
}
namespace mlx::core {
// Use the default implementation for the following primitives
@@ -29,6 +35,8 @@ DEFAULT(Broadcast)
DEFAULT(Ceil)
DEFAULT(Concatenate)
DEFAULT(Copy)
DEFAULT_MULTI(CustomVJP)
DEFAULT_MULTI(Depends)
DEFAULT(Equal)
DEFAULT(Erf)
DEFAULT(ErfInv)
@@ -41,7 +49,11 @@ DEFAULT(Less)
DEFAULT(LessEqual)
DEFAULT(Load)
DEFAULT(LogicalNot)
DEFAULT(LogicalAnd)
DEFAULT(LogicalOr)
DEFAULT(LogAddExp)
DEFAULT(Maximum)
DEFAULT(Minimum)
DEFAULT(NotEqual)
DEFAULT(Pad)
DEFAULT(Partition)
@@ -52,29 +64,22 @@ DEFAULT(Scatter)
DEFAULT(Sigmoid)
DEFAULT(Sign)
DEFAULT(Slice)
DEFAULT_MULTI(Split)
DEFAULT(Sort)
DEFAULT(StopGradient)
DEFAULT(Transpose)
DEFAULT_MULTI(DivMod)
DEFAULT_MULTI(QRF)
void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vDSP_vabs(in.data<float>(), 1, out.data<float>(), 1, size);
set_unary_output_data(in, out);
vDSP_vabs(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
} else if (in.dtype() == int32 && in.flags().contiguous) {
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vDSP_vabsi(in.data<int>(), 1, out.data<int>(), 1, size);
set_unary_output_data(in, out);
vDSP_vabsi(in.data<int>(), 1, out.data<int>(), 1, in.data_size());
} else if (is_unsigned(in.dtype())) {
// No-op for unsigned types
out.copy_shared_buffer(in);
@@ -127,12 +132,8 @@ void ArcCos::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvacosf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -143,12 +144,8 @@ void ArcCosh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvacoshf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -159,12 +156,8 @@ void ArcSin::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvasinf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -175,12 +168,8 @@ void ArcSinh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvasinhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -191,12 +180,8 @@ void ArcTan::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvatanf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -207,12 +192,8 @@ void ArcTanh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvatanhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -224,30 +205,23 @@ void AsType::eval_cpu(const std::vector<array>& inputs, array& out) {
auto& in = inputs[0];
if (in.flags().contiguous) {
auto allocfn = [&in, &out]() {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
};
// Use accelerate functions if possible
if (in.dtype() == float32 && out.dtype() == uint32) {
allocfn();
set_unary_output_data(in, out);
vDSP_vfixu32(
in.data<float>(), 1, out.data<uint32_t>(), 1, in.data_size());
return;
} else if (in.dtype() == float32 && out.dtype() == int32) {
allocfn();
set_unary_output_data(in, out);
vDSP_vfix32(in.data<float>(), 1, out.data<int32_t>(), 1, in.data_size());
return;
} else if (in.dtype() == uint32 && out.dtype() == float32) {
allocfn();
set_unary_output_data(in, out);
vDSP_vfltu32(
in.data<uint32_t>(), 1, out.data<float>(), 1, in.data_size());
return;
} else if (in.dtype() == int32 && out.dtype() == float32) {
allocfn();
set_unary_output_data(in, out);
vDSP_vflt32(in.data<int32_t>(), 1, out.data<float>(), 1, in.data_size());
return;
}
@@ -259,12 +233,8 @@ void Cos::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvcosf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -275,12 +245,8 @@ void Cosh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvcoshf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -368,12 +334,8 @@ void Exp::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvexpf(out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
} else if (is_floating_point(out.dtype())) {
unary_fp(in, out, [](auto x) { return std::exp(x); });
@@ -400,12 +362,8 @@ void Log::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
switch (base_) {
case Base::e:
vvlogf(
@@ -429,12 +387,8 @@ void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvlog1pf(
out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
} else if (is_floating_point(out.dtype())) {
@@ -446,47 +400,6 @@ void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void Maximum::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (out.dtype() == float32) {
binary(
a,
b,
out,
[](auto x, auto y) { return (x > y) ? x : y; },
UseDefaultBinaryOp(),
UseDefaultBinaryOp(),
[](const auto* a, const auto* b, auto* out, int n) {
vDSP_vmax((const float*)a, 1, (const float*)b, 1, (float*)out, 1, n);
});
} else {
binary(a, b, out, [](auto x, auto y) { return (x > y) ? x : y; });
}
}
void Minimum::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (out.dtype() == float32) {
binary(
a,
b,
out,
[](auto x, auto y) { return (x < y) ? x : y; },
UseDefaultBinaryOp(),
UseDefaultBinaryOp(),
[](const auto* a, const auto* b, auto* out, int n) {
vDSP_vmin((const float*)a, 1, (const float*)b, 1, (float*)out, 1, n);
});
} else {
binary(a, b, out, [](auto x, auto y) { return (x < y) ? x : y; });
}
}
void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
@@ -516,13 +429,8 @@ void Negative::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vDSP_vneg(in.data<float>(), 1, out.data<float>(), 1, size);
set_unary_output_data(in, out);
vDSP_vneg(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
} else {
unary(in, out, [](auto x) { return -x; });
}
@@ -535,7 +443,13 @@ void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() == float32 && a.flags().row_contiguous &&
b.flags().row_contiguous) {
int size = a.size();
out.set_data(allocator::malloc_or_wait(out.nbytes()));
if (a.is_donatable() && a.itemsize() == out.itemsize()) {
out.copy_shared_buffer(a);
} else if (b.is_donatable() && b.itemsize() == out.itemsize()) {
out.copy_shared_buffer(b);
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
vvpowf(out.data<float>(), b.data<float>(), a.data<float>(), &size);
} else {
eval(inputs, out);
@@ -577,12 +491,8 @@ void Sin::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvsinf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -593,12 +503,8 @@ void Sinh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvsinhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -609,12 +515,8 @@ void Square::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vDSP_vsq(in.data<float>(), 1, out.data<float>(), 1, size);
} else {
unary(in, out, [](auto x) { return x * x; });
@@ -625,12 +527,8 @@ void Sqrt::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
if (recip_) {
vvrsqrtf(out.data<float>(), in.data<float>(), &size);
} else {
@@ -685,12 +583,8 @@ void Tan::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvtanf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -701,12 +595,8 @@ void Tanh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvtanhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);

View File

@@ -16,4 +16,5 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
)

View File

@@ -6,6 +6,7 @@
#include "mlx/allocator.h"
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/binary_two.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
@@ -75,6 +76,61 @@ void Add::eval(const std::vector<array>& inputs, array& out) {
binary(a, b, out, [](auto x, auto y) { return x + y; });
}
void DivMod::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto integral_op = [](auto x, auto y) {
return std::make_pair(x / y, x % y);
};
auto float_op = [](auto x, auto y) {
return std::make_pair(std::trunc(x / y), std::fmod(x, y));
};
switch (outputs[0].dtype()) {
case bool_:
binary_op<bool>(a, b, outputs, integral_op);
break;
case uint8:
binary_op<uint8_t>(a, b, outputs, integral_op);
break;
case uint16:
binary_op<uint16_t>(a, b, outputs, integral_op);
break;
case uint32:
binary_op<uint32_t>(a, b, outputs, integral_op);
break;
case uint64:
binary_op<uint64_t>(a, b, outputs, integral_op);
break;
case int8:
binary_op<int8_t>(a, b, outputs, integral_op);
break;
case int16:
binary_op<int16_t>(a, b, outputs, integral_op);
break;
case int32:
binary_op<int32_t>(a, b, outputs, integral_op);
break;
case int64:
binary_op<int64_t>(a, b, outputs, integral_op);
break;
case float16:
binary_op<float16_t>(a, b, outputs, float_op);
break;
case float32:
binary_op<float>(a, b, outputs, float_op);
break;
case bfloat16:
binary_op<bfloat16_t>(a, b, outputs, float_op);
break;
case complex64:
// Should never get here
throw std::runtime_error("[DivMod] Complex type not supported");
break;
}
}
void Divide::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
@@ -177,14 +233,33 @@ void Maximum::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, [](auto x, auto y) { return (x > y) ? x : y; });
if (is_floating_point(out.dtype())) {
binary(a, b, out, [](auto x, auto y) {
if (std::isnan(x)) {
return x;
}
return (x > y) ? x : y;
});
} else {
binary(a, b, out, [](auto x, auto y) { return (x > y) ? x : y; });
}
}
void Minimum::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, [](auto x, auto y) { return (x < y) ? x : y; });
if (is_floating_point(out.dtype())) {
binary(a, b, out, [](auto x, auto y) {
if (std::isnan(x)) {
return x;
}
return (x < y) ? x : y;
});
} else {
binary(a, b, out, [](auto x, auto y) { return (x < y) ? x : y; });
}
}
void Multiply::eval(const std::vector<array>& inputs, array& out) {

View File

@@ -1,7 +1,6 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/utils.h"
@@ -40,29 +39,83 @@ void set_binary_op_output_data(
const array& a,
const array& b,
array& out,
BinaryOpType bopt) {
BinaryOpType bopt,
bool donate_with_move = false) {
switch (bopt) {
case ScalarScalar:
out.set_data(
allocator::malloc_or_wait(out.itemsize()), 1, a.strides(), a.flags());
break;
case ScalarVector:
out.set_data(
allocator::malloc_or_wait(b.data_size() * out.itemsize()),
b.data_size(),
b.strides(),
b.flags());
if (b.is_donatable() && b.itemsize() == out.itemsize()) {
if (donate_with_move) {
out.move_shared_buffer(b);
} else {
out.copy_shared_buffer(b);
}
} else {
out.set_data(
allocator::malloc_or_wait(b.data_size() * out.itemsize()),
b.data_size(),
b.strides(),
b.flags());
}
break;
case VectorScalar:
if (a.is_donatable() && a.itemsize() == out.itemsize()) {
if (donate_with_move) {
out.move_shared_buffer(a);
} else {
out.copy_shared_buffer(a);
}
} else {
out.set_data(
allocator::malloc_or_wait(a.data_size() * out.itemsize()),
a.data_size(),
a.strides(),
a.flags());
}
break;
case VectorVector:
out.set_data(
allocator::malloc_or_wait(a.data_size() * out.itemsize()),
a.data_size(),
a.strides(),
a.flags());
if (a.is_donatable() && a.itemsize() == out.itemsize()) {
if (donate_with_move) {
out.move_shared_buffer(a);
} else {
out.copy_shared_buffer(a);
}
} else if (b.is_donatable() && b.itemsize() == out.itemsize()) {
if (donate_with_move) {
out.move_shared_buffer(b);
} else {
out.copy_shared_buffer(b);
}
} else {
out.set_data(
allocator::malloc_or_wait(a.data_size() * out.itemsize()),
a.data_size(),
a.strides(),
a.flags());
}
break;
case General:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
if (a.is_donatable() && a.flags().row_contiguous &&
a.itemsize() == out.itemsize() && a.size() == out.size()) {
if (donate_with_move) {
out.move_shared_buffer(a);
} else {
out.copy_shared_buffer(a);
}
} else if (
b.is_donatable() && b.flags().row_contiguous &&
b.itemsize() == out.itemsize() && b.size() == out.size()) {
if (donate_with_move) {
out.move_shared_buffer(b);
} else {
out.copy_shared_buffer(b);
}
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
break;
}
}
@@ -73,6 +126,12 @@ struct UseDefaultBinaryOp {
// Should we throw? This should normally never be called.
assert(false);
}
template <typename T, typename U>
void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
// Should we throw? This should normally never be called.
assert(false);
}
};
template <typename T, typename U, typename Op>
@@ -89,6 +148,18 @@ struct DefaultVectorScalar {
a++;
}
}
void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
T scalar = *b;
while (size-- > 0) {
auto dst = op(*a, scalar);
*dst_a = dst.first;
*dst_b = dst.second;
dst_a++;
dst_b++;
a++;
}
}
};
template <typename T, typename U, typename Op>
@@ -105,6 +176,18 @@ struct DefaultScalarVector {
b++;
}
}
void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
T scalar = *a;
while (size-- > 0) {
auto dst = op(scalar, *b);
*dst_a = dst.first;
*dst_b = dst.second;
dst_a++;
dst_b++;
b++;
}
}
};
template <typename T, typename U, typename Op>
@@ -121,6 +204,18 @@ struct DefaultVectorVector {
b++;
}
}
void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
while (size-- > 0) {
auto dst = op(*a, *b);
*dst_a = dst.first;
*dst_b = dst.second;
dst_a++;
dst_b++;
a++;
b++;
}
}
};
template <typename T, typename U, typename Op>

View File

@@ -0,0 +1,536 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/utils.h"
namespace mlx::core {
namespace {
template <typename T, typename U, typename Op>
void binary_op_dims1(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
for (size_t i = 0; i < out_a.size(); ++i) {
auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
dst_a[i] = dst.first;
dst_b[i] = dst.second;
a_idx += a.strides()[0];
b_idx += b.strides()[0];
}
}
template <typename T, typename U, typename Op>
void binary_op_dims1(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op,
int stride) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
for (size_t i = 0; i < a.shape()[0]; i++) {
op(a_ptr + a_idx, b_ptr + b_idx, dst_a, dst_b, stride);
a_idx += a.strides()[0];
b_idx += b.strides()[0];
dst_a += stride;
dst_b += stride;
}
}
template <typename T, typename U, typename Op>
void binary_op_dims2(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
size_t out_idx = 0;
for (size_t i = 0; i < a.shape()[0]; ++i) {
for (size_t j = 0; j < a.shape()[1]; ++j) {
auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
dst_a[out_idx] = dst.first;
dst_b[out_idx++] = dst.second;
a_idx += a.strides()[1];
b_idx += b.strides()[1];
}
a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
}
}
template <typename T, typename U, typename Op>
void binary_op_dims2(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op,
int stride) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
for (size_t i = 0; i < a.shape()[0]; ++i) {
for (size_t j = 0; j < a.shape()[1]; ++j) {
op(a_ptr + a_idx, b_ptr + b_idx, dst_a, dst_b, stride);
a_idx += a.strides()[1];
b_idx += b.strides()[1];
dst_a += stride;
dst_b += stride;
}
a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
}
}
template <typename T, typename U, typename Op>
void binary_op_dims3(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
size_t out_idx = 0;
for (size_t i = 0; i < a.shape()[0]; ++i) {
for (size_t j = 0; j < a.shape()[1]; ++j) {
for (size_t k = 0; k < a.shape()[2]; ++k) {
auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
dst_a[out_idx] = dst.first;
dst_b[out_idx++] = dst.second;
a_idx += a.strides()[2];
b_idx += b.strides()[2];
}
a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
}
a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
}
}
template <typename T, typename U, typename Op>
void binary_op_dims4(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
size_t out_idx = 0;
for (size_t i = 0; i < a.shape()[0]; ++i) {
for (size_t j = 0; j < a.shape()[1]; ++j) {
for (size_t k = 0; k < a.shape()[2]; ++k) {
for (size_t ii = 0; ii < a.shape()[3]; ++ii) {
auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
dst_a[out_idx] = dst.first;
dst_b[out_idx++] = dst.second;
a_idx += a.strides()[3];
b_idx += b.strides()[3];
}
a_idx += a.strides()[2] - a.strides()[3] * a.shape()[3];
b_idx += b.strides()[2] - b.strides()[3] * b.shape()[3];
}
a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
}
a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
}
}
template <typename T, typename U, typename Op>
void binary_op_dispatch_dims(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
switch (out_a.ndim()) {
case 1:
binary_op_dims1<T, U, Op>(a, b, out_a, out_b, op);
return;
case 2:
binary_op_dims2<T, U, Op>(a, b, out_a, out_b, op);
return;
case 3:
binary_op_dims3<T, U, Op>(a, b, out_a, out_b, op);
return;
case 4:
binary_op_dims4<T, U, Op>(a, b, out_a, out_b, op);
return;
}
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
for (size_t i = 0; i < out_a.size(); i++) {
int a_idx = elem_to_loc(i, a.shape(), a.strides());
int b_idx = elem_to_loc(i, b.shape(), b.strides());
std::tie(dst_a[i], dst_b[i]) = op(a_ptr[a_idx], b_ptr[b_idx]);
}
}
template <typename T, typename U, typename Op>
void binary_op_dispatch_dims(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op,
int dim,
int stride) {
// Number of dimensions to loop over for vectorized ops
switch (dim) {
case 1:
binary_op_dims1<T, U, Op>(a, b, out_a, out_b, op, stride);
return;
case 2:
binary_op_dims2<T, U, Op>(a, b, out_a, out_b, op, stride);
return;
}
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
for (size_t i = 0; i < out_a.size(); i += stride) {
int a_idx = elem_to_loc(i, a.shape(), a.strides());
int b_idx = elem_to_loc(i, b.shape(), b.strides());
op(a_ptr + a_idx, b_ptr + b_idx, dst_a, dst_b, stride);
dst_a += stride;
dst_b += stride;
}
}
template <
typename T,
typename U,
typename Op,
typename OpSV,
typename OpVS,
typename OpVV>
void binary_op(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op,
OpSV opsv,
OpVS opvs,
OpVV opvv) {
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out_a, bopt);
set_binary_op_output_data(a, b, out_b, bopt);
// The full computation is scalar scalar so call the base op once
if (bopt == ScalarScalar) {
std::tie(*(out_a.data<U>()), *(out_b.data<U>())) =
op(*a.data<T>(), *b.data<T>());
return;
}
// The full computation is scalar vector so delegate to the op
if (bopt == ScalarVector) {
opsv(
a.data<T>(),
b.data<T>(),
out_a.data<U>(),
out_b.data<U>(),
b.data_size());
return;
}
// The full computation is vector scalar so delegate to the op
if (bopt == VectorScalar) {
opvs(
a.data<T>(),
b.data<T>(),
out_a.data<U>(),
out_b.data<U>(),
a.data_size());
return;
}
// The full computation is vector vector so delegate to the op
if (bopt == VectorVector) {
opvv(
a.data<T>(),
b.data<T>(),
out_a.data<U>(),
out_b.data<U>(),
out_a.size());
return;
}
// General computation so let's try to optimize
// Get the left-most dim such that the array is row contiguous after
auto& strides = out_a.strides();
auto leftmost_rc_dim = [&strides](const array& arr) {
int d = arr.ndim() - 1;
for (; d >= 0 && arr.strides()[d] == strides[d]; d--) {
}
return d + 1;
};
auto a_rc_dim = leftmost_rc_dim(a);
auto b_rc_dim = leftmost_rc_dim(b);
// Get the left-most dim such that the array is a broadcasted "scalar" after
auto leftmost_s_dim = [](const array& arr) {
int d = arr.ndim() - 1;
for (; d >= 0 && arr.strides()[d] == 0; d--) {
}
return d + 1;
};
auto a_s_dim = leftmost_s_dim(a);
auto b_s_dim = leftmost_s_dim(b);
auto ndim = out_a.ndim();
// Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
int dim = ndim;
if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
bopt = VectorVector;
dim = d;
// Case 2: LxM and Fx1 where L and F are broadcastable and M is row
// contiguous
} else if (int d = std::max(a_rc_dim, b_s_dim); d < ndim) {
bopt = VectorScalar;
dim = d;
// Case 3: Lx1 and FxM where L and F are broadcastable and M is row
// contiguous
} else if (int d = std::max(a_s_dim, b_rc_dim); d < ndim) {
bopt = ScalarVector;
dim = d;
}
// Can be sure dim > 0 since otherwise we would have used one of the fully
// contiguous methods above. Except for the case that the flags do not
// correspond to the underlying contiguity.
size_t stride;
if (dim == 0 || strides[dim - 1] < 16) {
stride = 1;
bopt = General;
dim = ndim;
} else {
stride = strides[dim - 1];
}
switch (bopt) {
case VectorVector:
binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, opvv, dim, stride);
break;
case VectorScalar:
binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, opvs, dim, stride);
break;
case ScalarVector:
binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, opsv, dim, stride);
break;
default:
binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, op);
break;
}
}
template <typename T, typename Op, typename OpSV, typename OpVS, typename OpVV>
void binary_op(
const array& a,
const array& b,
std::vector<array>& outputs,
Op op,
OpSV opsv,
OpVS opvs,
OpVV opvv) {
// TODO: The following mess of constexpr evaluations can probably be achieved
// with template specializations and overloading. Would it be simpler?
if (std::is_same<decltype(opsv), UseDefaultBinaryOp>::value) {
if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// All ops are UseDefaultBinaryOp (why oh why would someone call that?)
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
DefaultScalarVector<T, T, Op>(op),
DefaultVectorScalar<T, T, Op>(op),
DefaultVectorVector<T, T, Op>(op));
} else {
// opsv and opvs were UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
DefaultScalarVector<T, T, Op>(op),
DefaultVectorScalar<T, T, Op>(op),
opvv);
}
} else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// opsv and opvv were UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
DefaultScalarVector<T, T, Op>(op),
opvs,
DefaultVectorVector<T, T, Op>(op));
} else {
// opsv was UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
DefaultScalarVector<T, T, Op>(op),
opvs,
opvv);
}
} else if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// opvs and opvv were UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
opsv,
DefaultVectorScalar<T, T, Op>(op),
DefaultVectorVector<T, T, Op>(op));
} else {
// opvs was UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
opsv,
DefaultVectorScalar<T, T, Op>(op),
opvv);
}
} else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// opvv was UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
opsv,
opvs,
DefaultVectorVector<T, T, Op>(op));
} else {
// All ops provided
binary_op<T, T>(a, b, outputs[0], outputs[1], op, opsv, opvs, opvv);
}
}
template <typename T, typename Op>
void binary_op(
const array& a,
const array& b,
std::vector<array>& outputs,
Op op) {
DefaultScalarVector<T, T, Op> opsv(op);
DefaultVectorScalar<T, T, Op> opvs(op);
DefaultVectorVector<T, T, Op> opvv(op);
binary_op<T, T>(a, b, outputs[0], outputs[1], op, opsv, opvs, opvv);
}
template <typename... Ops>
void binary(
const array& a,
const array& b,
std::vector<array>& outputs,
Ops... ops) {
switch (outputs[0].dtype()) {
case bool_:
binary_op<bool>(a, b, outputs, ops...);
break;
case uint8:
binary_op<uint8_t>(a, b, outputs, ops...);
break;
case uint16:
binary_op<uint16_t>(a, b, outputs, ops...);
break;
case uint32:
binary_op<uint32_t>(a, b, outputs, ops...);
break;
case uint64:
binary_op<uint64_t>(a, b, outputs, ops...);
break;
case int8:
binary_op<int8_t>(a, b, outputs, ops...);
break;
case int16:
binary_op<int16_t>(a, b, outputs, ops...);
break;
case int32:
binary_op<int32_t>(a, b, outputs, ops...);
break;
case int64:
binary_op<int64_t>(a, b, outputs, ops...);
break;
case float16:
binary_op<float16_t>(a, b, outputs, ops...);
break;
case float32:
binary_op<float>(a, b, outputs, ops...);
break;
case bfloat16:
binary_op<bfloat16_t>(a, b, outputs, ops...);
break;
case complex64:
binary_op<complex64_t>(a, b, outputs, ops...);
break;
}
}
} // namespace
} // namespace mlx::core
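A hedged sketch of the kind of pair-returning element-wise op this two-output machinery exists for, in the style of DivMod on integer inputs (names illustrative only):

#include <utility>
// Per-element quotient and remainder; assumes integer element types.
auto divmod = [](auto x, auto y) {
  return std::make_pair(x / y, x % y);
};
// Plugged into the helpers above it would be dispatched roughly as:
//   binary_op<int64_t>(a, b, outputs, divmod);  // fills outputs[0] and outputs[1]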

View File

@@ -289,11 +289,16 @@ void copy(const array& src, array& dst, CopyType ctype) {
// Allocate the output
switch (ctype) {
case CopyType::Vector:
dst.set_data(
allocator::malloc_or_wait(src.data_size() * dst.itemsize()),
src.data_size(),
src.strides(),
src.flags());
if (src.is_donatable() && src.itemsize() == dst.itemsize()) {
dst.copy_shared_buffer(src);
} else {
auto size = src.data_size();
dst.set_data(
allocator::malloc_or_wait(size * dst.itemsize()),
size,
src.strides(),
src.flags());
}
break;
case CopyType::Scalar:
case CopyType::General:

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#ifdef ACCELERATE_NEW_LAPACK
#include <vecLib/cblas_new.h>
@@ -6,6 +6,8 @@
#include <cblas.h>
#endif
#include <cstring>
#include "mlx/array.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
@@ -16,6 +18,12 @@
primitive::eval(inputs, out); \
}
#define DEFAULT_MULTI(primitive) \
void primitive::eval_cpu( \
const std::vector<array>& inputs, std::vector<array>& outputs) { \
primitive::eval(inputs, outputs); \
}
namespace mlx::core {
DEFAULT(Abs)
@@ -39,6 +47,8 @@ DEFAULT(Convolution)
DEFAULT(Copy)
DEFAULT(Cos)
DEFAULT(Cosh)
DEFAULT_MULTI(CustomVJP)
DEFAULT_MULTI(Depends)
DEFAULT(Divide)
DEFAULT(Remainder)
DEFAULT(Equal)
@@ -57,6 +67,8 @@ DEFAULT(Load)
DEFAULT(Log)
DEFAULT(Log1p)
DEFAULT(LogicalNot)
DEFAULT(LogicalAnd)
DEFAULT(LogicalOr)
DEFAULT(LogAddExp)
DEFAULT(Maximum)
DEFAULT(Minimum)
@@ -80,6 +92,7 @@ DEFAULT(Sinh)
DEFAULT(Slice)
DEFAULT(Softmax)
DEFAULT(Sort)
DEFAULT_MULTI(Split)
DEFAULT(Square)
DEFAULT(Sqrt)
DEFAULT(StopGradient)
@@ -87,17 +100,17 @@ DEFAULT(Subtract)
DEFAULT(Tan)
DEFAULT(Tanh)
DEFAULT(Transpose)
DEFAULT_MULTI(DivMod)
DEFAULT_MULTI(QRF)
void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[Matmul::eval_cpu] Currently only supports float32.");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto& a_pre = inputs[0];
auto& b_pre = inputs[1];
namespace {
inline void matmul_common_general(
const array& a_pre,
const array& b_pre,
array& out,
float alpha = 1.0f,
float beta = 0.0f) {
auto check_transpose = [](const array& arr) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
@@ -115,9 +128,15 @@ void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
int M = a.shape(-2);
int N = b.shape(-1);
int K = a.shape(-1);
size_t M = a.shape(-2);
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
for (int i = 0; i < (a.size() / (M * K)); ++i) {
cblas_sgemm(
CblasRowMajor,
@@ -126,16 +145,41 @@ void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
M,
N,
K,
1.0f, // alpha
alpha, // alpha
a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
lda,
b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
ldb,
0.0f, // beta
beta, // beta
out.data<float>() + M * N * i,
out.shape(-1) // ldc
);
}
}
} // namespace
void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[Matmul::eval_cpu] Currently only supports float32.");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
return matmul_common_general(inputs[0], inputs[1], out);
}
void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[AddMM::eval_cpu] Currently only supports float32.");
}
// Fill output with C
auto& c = inputs[2];
CopyType ctype = c.data_size() == 1 ? CopyType::Scalar : CopyType::General;
copy(c, out, ctype);
return matmul_common_general(inputs[0], inputs[1], out, alpha_, beta_);
}
} // namespace mlx::core
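A worked scalar illustration of what the AddMM path above computes, namely out = alpha * (a @ b) + beta * c: c is copied into out first, then cblas_sgemm accumulates into it using the stored alpha_/beta_ rather than the fixed 1.0/0.0 used by plain Matmul.

// Hedged 1x1 example: alpha = 2, beta = 0.5, a = 3, b = 4, c = 10
// out = 2 * (3 * 4) + 0.5 * 10 = 29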

View File

@@ -8,6 +8,7 @@
#include "mlx/allocator.h"
#include "mlx/backend/common/arange.h"
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/erf.h"
#include "mlx/backend/common/threefry.h"
@@ -231,22 +232,38 @@ void Cosh::eval(const std::vector<array>& inputs, array& out) {
}
}
void CustomVJP::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
i++, j++) {
outputs[i].copy_shared_buffer(inputs[j]);
}
}
void Depends::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0; i < outputs.size(); i++) {
outputs[i].copy_shared_buffer(inputs[i]);
}
}
void Erf::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
switch (out.dtype()) {
case float32:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<float>(in, out, [](auto x) { return std::erf(x); });
break;
case float16:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<float16_t>(in, out, [](auto x) {
return static_cast<float16_t>(std::erf(static_cast<float>(x)));
});
break;
case bfloat16:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<bfloat16_t>(in, out, [](auto x) {
return static_cast<bfloat16_t>(std::erf(static_cast<float>(x)));
});
@@ -263,17 +280,14 @@ void ErfInv::eval(const std::vector<array>& inputs, array& out) {
const auto& in = inputs[0];
switch (out.dtype()) {
case float32:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<float>(in, out, [](auto x) { return erfinv(x); });
break;
case float16:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<float16_t>(in, out, [](auto x) {
return static_cast<float16_t>(erfinv(static_cast<float>(x)));
});
break;
case bfloat16:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<bfloat16_t>(in, out, [](auto x) {
return static_cast<bfloat16_t>(erfinv(static_cast<float>(x)));
});
@@ -364,6 +378,20 @@ void LogicalNot::eval(const std::vector<array>& inputs, array& out) {
unary(in, out, [](auto x) { return !x; });
}
void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalAnd requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, [](auto x, auto y) { return x && y; });
}
void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalOr requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, [](auto x, auto y) { return x || y; });
}
void Negative::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
@@ -573,6 +601,58 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
out.copy_shared_buffer(in, strides, flags, data_size, data_offset);
}
void Split::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 1);
auto& in = inputs[0];
auto compute_new_flags = [](const auto& shape,
const auto& strides,
size_t in_data_size,
auto flags) {
size_t data_size = 1;
size_t f_stride = 1;
size_t b_stride = 1;
flags.row_contiguous = true;
flags.col_contiguous = true;
for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
flags.col_contiguous &= strides[i] == f_stride || shape[i] == 1;
flags.row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
f_stride *= shape[i];
b_stride *= shape[ri];
if (strides[i] > 0) {
data_size *= shape[i];
}
}
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in_data_size) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
return std::pair<decltype(flags), size_t>{flags, data_size};
};
std::vector<int> indices(1, 0);
indices.insert(indices.end(), indices_.begin(), indices_.end());
for (int i = 0; i < indices.size(); i++) {
size_t offset = indices[i] * in.strides()[axis_];
auto [new_flags, data_size] = compute_new_flags(
outputs[i].shape(), in.strides(), in.data_size(), in.flags());
outputs[i].copy_shared_buffer(
in, in.strides(), new_flags, data_size, offset);
}
}
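A hedged worked example of the offset computation in Split::eval above: each output is a view into the input's buffer starting at indices[i] * in.strides()[axis_], so no data is copied.

// Splitting a row-contiguous (6, 4) float32 array along axis 0 at {2, 5}
// gives strides()[0] == 4 and element offsets 0*4 = 0, 2*4 = 8, 5*4 = 20
// into the shared buffer for the three output views.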
void Square::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];

mlx/backend/common/qrf.cpp (new file, 153 lines)
View File

@@ -0,0 +1,153 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/primitives.h"
#ifdef ACCELERATE_NEW_LAPACK
#include <vecLib/lapack.h>
#else
#include <lapack.h>
#endif
namespace mlx::core {
template <typename T>
struct lpack;
template <>
struct lpack<float> {
static void xgeqrf(
const int* m,
const int* n,
float* a,
const int* lda,
float* tau,
float* work,
const int* lwork,
int* info) {
sgeqrf_(m, n, a, lda, tau, work, lwork, info);
}
static void xorgqr(
const int* m,
const int* n,
const int* k,
float* a,
const int* lda,
const float* tau,
float* work,
const int* lwork,
int* info) {
sorgqr_(m, n, k, a, lda, tau, work, lwork, info);
}
};
template <typename T>
void qrf_impl(const array& a, array& q, array& r) {
const int M = a.shape(-2);
const int N = a.shape(-1);
const int lda = std::max(M, N);
size_t num_matrices = a.size() / (M * N);
int num_reflectors = std::min(M, N);
auto tau =
allocator::malloc_or_wait(sizeof(T) * num_matrices * num_reflectors);
// Copy A to inplace input and make it col-contiguous
array in(a.shape(), float32, nullptr, {});
auto flags = in.flags();
// Copy the input to be column contiguous
flags.col_contiguous = num_matrices == 1;
flags.row_contiguous = false;
std::vector<size_t> strides = in.strides();
strides[in.ndim() - 2] = 1;
strides[in.ndim() - 1] = M;
in.set_data(
allocator::malloc_or_wait(in.nbytes()), in.nbytes(), strides, flags);
copy_inplace(a, in, CopyType::GeneralGeneral);
T optimal_work;
int lwork = -1;
int info;
// Compute workspace size
lpack<T>::xgeqrf(
&M, &N, nullptr, &lda, nullptr, &optimal_work, &lwork, &info);
// Update workspace size
lwork = optimal_work;
auto work = allocator::malloc_or_wait(sizeof(T) * lwork);
// Loop over matrices
for (int i = 0; i < num_matrices; ++i) {
// Solve
lpack<T>::xgeqrf(
&M,
&N,
in.data<float>() + M * N * i,
&lda,
static_cast<T*>(tau.raw_ptr()) + num_reflectors * i,
static_cast<T*>(work.raw_ptr()),
&lwork,
&info);
}
allocator::free(work);
r.set_data(allocator::malloc_or_wait(r.nbytes()));
copy_inplace(in, r, CopyType::General);
for (int i = 0; i < num_matrices; ++i) {
// Zero lower triangle
for (int j = 0; j < r.shape(-2); ++j) {
for (int k = 0; k < j; ++k) {
r.data<T>()[i * N * M + j * N + k] = 0;
}
}
}
// Get work size
lwork = -1;
lpack<T>::xorgqr(
&M,
&N,
&num_reflectors,
nullptr,
&lda,
nullptr,
&optimal_work,
&lwork,
&info);
lwork = optimal_work;
work = allocator::malloc_or_wait(sizeof(T) * lwork);
// Loop over matrices
for (int i = 0; i < num_matrices; ++i) {
// Compute Q
lpack<T>::xorgqr(
&M,
&N,
&num_reflectors,
in.data<float>() + M * N * i,
&lda,
static_cast<T*>(tau.raw_ptr()) + num_reflectors * i,
static_cast<T*>(work.raw_ptr()),
&lwork,
&info);
}
q.set_data(allocator::malloc_or_wait(q.nbytes()));
copy_inplace(in, q, CopyType::General);
// Cleanup
allocator::free(work);
allocator::free(tau);
}
void QRF::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
if (!(inputs[0].dtype() == float32)) {
throw std::runtime_error("[QRF::eval] only supports float32.");
}
qrf_impl<float>(inputs[0], outputs[0], outputs[1]);
}
} // namespace mlx::core
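A minimal, hedged sketch of the two-phase LAPACK calling convention qrf_impl relies on (illustrative, not MLX code): pass lwork = -1 to query the optimal workspace size, then allocate that much scratch and run the real factorization.

#include <algorithm>
#include <vector>
#include <lapack.h>  // <vecLib/lapack.h> with Accelerate, as above
void qr_factor(std::vector<float>& a_colmajor, int m, int n) {
  int lda = std::max(1, m), info = 0, lwork = -1;
  std::vector<float> tau(std::min(m, n));
  float query = 0.0f;
  // Workspace query: the optimal size comes back in `query`.
  sgeqrf_(&m, &n, a_colmajor.data(), &lda, tau.data(), &query, &lwork, &info);
  lwork = static_cast<int>(query);
  std::vector<float> work(lwork);
  // Actual factorization with the sized workspace.
  sgeqrf_(&m, &n, a_colmajor.data(), &lda, tau.data(), work.data(), &lwork, &info);
}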

View File

@@ -1,7 +1,6 @@
// Copyright © 2023 Apple Inc.
#include <cassert>
#include <iostream>
#include "mlx/backend/metal/copy.h"
#include "mlx/primitives.h"
@@ -119,6 +118,12 @@ void _qmm_dispatch_typed(
switch (bits) {
case 2: {
switch (group_size) {
case 32:
if (transposed_w) {
return _qmm_t<T, 2, 32>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 2, 32>(result, x, w, scales, biases, M, N, K);
}
case 64:
if (transposed_w) {
return _qmm_t<T, 2, 64>(result, x, w, scales, biases, M, N, K);
@@ -135,6 +140,12 @@ void _qmm_dispatch_typed(
}
case 4: {
switch (group_size) {
case 32:
if (transposed_w) {
return _qmm_t<T, 4, 32>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 4, 32>(result, x, w, scales, biases, M, N, K);
}
case 64:
if (transposed_w) {
return _qmm_t<T, 4, 64>(result, x, w, scales, biases, M, N, K);
@@ -151,6 +162,12 @@ void _qmm_dispatch_typed(
}
case 8: {
switch (group_size) {
case 32:
if (transposed_w) {
return _qmm_t<T, 8, 32>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 8, 32>(result, x, w, scales, biases, M, N, K);
}
case 64:
if (transposed_w) {
return _qmm_t<T, 8, 64>(result, x, w, scales, biases, M, N, K);

View File

@@ -56,23 +56,32 @@ struct SignOp {
struct RoundOp {
template <typename T>
T operator()(T x) {
return std::round(x);
return std::rint(x);
}
complex64_t operator()(complex64_t x) {
return {std::round(x.real()), std::round(x.imag())};
return {std::rint(x.real()), std::rint(x.imag())};
}
};
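A brief note on the std::round to std::rint change above: std::round always rounds halfway cases away from zero, while std::rint follows the current floating-point rounding mode, which defaults to round-half-to-even; presumably the motivation is consistency with IEEE round-to-nearest behaviour elsewhere.

// std::round(2.5) == 3.0    std::rint(2.5) == 2.0   (default FE_TONEAREST)
// std::round(3.5) == 4.0    std::rint(3.5) == 4.0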
void set_unary_output_data(const array& in, array& out) {
if (in.is_donatable() && in.itemsize() == out.itemsize()) {
out.copy_shared_buffer(in);
} else {
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
}
}
template <typename T, typename Op>
void unary_op(const array& a, array& out, Op op) {
const T* a_ptr = a.data<T>();
if (a.flags().contiguous) {
out.set_data(
allocator::malloc_or_wait(a.data_size() * out.itemsize()),
a.data_size(),
a.strides(),
a.flags());
set_unary_output_data(a, out);
T* dst = out.data<T>();
for (size_t i = 0; i < a.data_size(); ++i) {
dst[i] = op(a_ptr[i]);

View File

@@ -23,12 +23,23 @@ void* Buffer::raw_ptr() {
namespace metal {
static bool cache_enabled_ = true;
bool cache_enabled() {
return cache_enabled_;
}
void set_cache_enabled(bool enabled) {
cache_enabled_ = enabled;
}
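A hedged usage sketch of the cache switch added here: when the cache is disabled, MetalAllocator::free releases buffers back to the OS immediately instead of recycling them through the BufferCache.

// Illustrative C++ usage from mlx::core code.
metal::set_cache_enabled(false);  // free() now calls buf->release() directly
// ... allocate and free temporaries without growing the cache ...
metal::set_cache_enabled(true);   // restore buffer recycling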
namespace {
BufferCache::BufferCache(MTL::Device* device)
: device_(device), head_(nullptr), tail_(nullptr), pool_size_(0) {}
BufferCache::~BufferCache() {
auto thread_pool = metal::new_scoped_memory_pool();
clear();
}
@@ -152,6 +163,11 @@ MetalAllocator::MetalAllocator()
gc_limit_(0.95 * device_->recommendedMaxWorkingSetSize()) {}
Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
// Metal doesn't like empty buffers
if (size == 0) {
return Buffer{nullptr};
}
// Align up memory
if (size > vm_page_size) {
size = vm_page_size * ((size + vm_page_size - 1) / vm_page_size);
@@ -166,6 +182,8 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
return Buffer{nullptr};
}
auto thread_pool = metal::new_scoped_memory_pool();
// If we have a lot of memory pressure, check if we can reclaim some memory
// from the cache
if (device_->currentAllocatedSize() + size >= gc_limit_) {
@@ -188,7 +206,11 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
void MetalAllocator::free(Buffer buffer) {
auto buf = static_cast<MTL::Buffer*>(buffer.ptr());
buffer_cache_.recycle_to_cache(buf);
if (cache_enabled()) {
buffer_cache_.recycle_to_cache(buf);
} else {
buf->release();
}
}
MetalAllocator& allocator() {

View File

@@ -2,7 +2,6 @@
#include <algorithm>
#include <cassert>
#include <iostream>
#include <numeric>
#include <sstream>
@@ -70,7 +69,7 @@ void explicit_gemm_conv_1D_gpu(
// Perform gemm
std::vector<array> copies = {in_padded, in_strided};
mlx_matmul(
return steel_matmul(
s,
d,
/*a = */ in_strided,
@@ -262,7 +261,7 @@ void explicit_gemm_conv_2D_gpu(
// Perform gemm
std::vector<array> copies = {in_padded, in_strided};
mlx_matmul(
return steel_matmul(
s,
d,
/*a = */ in_strided,
@@ -411,7 +410,7 @@ void winograd_conv_2D_gpu(
copies_w.push_back(out_wg);
{
std::vector<array> empty_copies;
mlx_matmul(
steel_matmul(
s,
d,
/*a = */ inp_wg,

View File

@@ -12,14 +12,21 @@ namespace mlx::core {
void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
if (ctype == CopyType::Vector) {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
if (in.is_donatable() && in.itemsize() == out.itemsize()) {
out.move_shared_buffer(in);
} else {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
}
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
if (out.size() == 0) {
return;
}
if (ctype == CopyType::GeneralGeneral) {
ctype = CopyType::General;
}
@@ -64,7 +71,8 @@ void copy_gpu_inplace(
auto kernel = d.get_kernel(kname.str());
auto compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
set_array_buffer(compute_encoder, in, 0);
bool donate_in = in.data_shared_ptr() == nullptr;
set_array_buffer(compute_encoder, donate_in ? out : in, 0);
set_array_buffer(compute_encoder, out, 1);
if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-24 Apple Inc.
#include <dlfcn.h>
#include <cstdlib>
@@ -19,16 +19,14 @@ namespace mlx::core::metal {
namespace {
// Catch things related to the main-thread static variables
static std::shared_ptr<void> global_memory_pool = new_scoped_memory_pool();
// TODO nicer way to set this or possibly expose as an environment variable
static constexpr int MAX_BUFFERS_PER_QUEUE = 12;
static constexpr const char* default_mtllib_path = METAL_PATH;
auto load_device() {
MTL::Device* device = MTL::CreateSystemDefaultDevice();
auto devices = MTL::CopyAllDevices();
auto device = static_cast<MTL::Device*>(devices->object(0));
if (!device) {
throw std::runtime_error("Failed to load device");
}
@@ -120,6 +118,7 @@ Device::Device() {
}
Device::~Device() {
auto pool = new_scoped_memory_pool();
for (auto& q : queue_map_) {
q.second->release();
}
@@ -139,6 +138,8 @@ Device::~Device() {
}
void Device::new_queue(int index) {
auto thread_pool = metal::new_scoped_memory_pool();
// Multiple threads can ask the device for queues
// We lock this as a critical section for safety
const std::lock_guard<std::mutex> lock(mtx_);
@@ -241,37 +242,127 @@ void Device::register_library(
}
}
MTL::ComputePipelineState* Device::get_kernel(
const std::string& name,
const std::string& lib_name /* = "mlx" */) {
auto pool = new_scoped_memory_pool();
// Look for cached kernel
if (auto it = kernel_map_.find(name); it != kernel_map_.end()) {
return it->second;
}
// Prepare new kernel
MTL::Library* Device::get_library_cache_(const std::string& lib_name) {
// Search for cached metal lib
MTL::Library* mtl_lib;
if (auto it = library_map_.find(name); it != library_map_.end()) {
if (auto it = library_map_.find(lib_name); it != library_map_.end()) {
mtl_lib = it->second;
} else { // Look for metallib alongside library
register_library(lib_name);
mtl_lib = library_map_[lib_name];
}
return mtl_lib;
}
MTL::Library* Device::get_library_(const std::string& source_string) {
auto pool = new_scoped_memory_pool();
auto ns_code =
NS::String::string(source_string.c_str(), NS::ASCIIStringEncoding);
NS::Error* error = nullptr;
auto mtl_lib = device_->newLibrary(ns_code, nullptr, &error);
// Throw error if unable to compile library
if (!mtl_lib) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load build metal library from source"
<< "\n";
if (error) {
msg << error->localizedDescription()->utf8String() << "\n";
}
throw std::runtime_error(msg.str());
}
return mtl_lib;
}
MTL::Library* Device::get_library_(const MTL::StitchedLibraryDescriptor* desc) {
auto pool = new_scoped_memory_pool();
NS::Error* error = nullptr;
auto mtl_lib = device_->newLibrary(desc, &error);
// Throw error if unable to compile library
if (!mtl_lib) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load build stitched metal library"
<< "\n";
if (error) {
msg << error->localizedDescription()->utf8String() << "\n";
}
throw std::runtime_error(msg.str());
}
return mtl_lib;
}
MTL::Function* Device::get_function_(
const std::string& name,
MTL::Library* mtl_lib) {
// Pull kernel from library
auto ns_name = NS::String::string(name.c_str(), NS::ASCIIStringEncoding);
auto mtl_function = mtl_lib->newFunction(ns_name);
return mtl_function;
}
MTL::Function* Device::get_function_(
const std::string& name,
const std::string& specialized_name,
const MTLFCList& func_consts,
MTL::Library* mtl_lib) {
if (func_consts.empty() && (specialized_name == name)) {
return get_function_(name, mtl_lib);
}
// Prepare function constants
auto mtl_func_consts = MTL::FunctionConstantValues::alloc()->init();
for (auto [value, type, index] : func_consts) {
mtl_func_consts->setConstantValue(value, type, index);
}
// Prepare function desc
auto desc = MTL::FunctionDescriptor::functionDescriptor();
desc->setName(NS::String::string(name.c_str(), NS::ASCIIStringEncoding));
desc->setSpecializedName(
NS::String::string(specialized_name.c_str(), NS::ASCIIStringEncoding));
desc->setConstantValues(mtl_func_consts);
// Pull kernel from library
NS::Error* error = nullptr;
auto mtl_function = mtl_lib->newFunction(desc, &error);
// Throw error if unable to build metal function
if (!mtl_function) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load function " << name << "\n";
if (error) {
msg << error->localizedDescription()->utf8String() << "\n";
}
throw std::runtime_error(msg.str());
}
mtl_func_consts->release();
desc->release();
return mtl_function;
}
MTL::ComputePipelineState* Device::get_kernel_(
const std::string& name,
const MTL::Function* mtl_function) {
// Compile kernel to compute pipeline
NS::Error* error = nullptr;
MTL::ComputePipelineState* kernel;
if (mtl_function) {
kernel = device_->newComputePipelineState(mtl_function, &error);
mtl_function->release();
}
// Throw error if unable to compile metal function
if (!mtl_function || !kernel) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load kernel " << name << "\n";
@@ -281,11 +372,170 @@ MTL::ComputePipelineState* Device::get_kernel(
throw std::runtime_error(msg.str());
}
// Add kernel to cache
kernel_map_.insert({name, kernel});
return kernel;
}
MTL::ComputePipelineState* Device::get_kernel_(
const std::string& name,
const MTL::Function* mtl_function,
const MTL::LinkedFunctions* linked_functions) {
// Check inputs
if (!linked_functions) {
return get_kernel_(name, mtl_function);
}
if (!mtl_function) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load kernel " << name << "\n";
throw std::runtime_error(msg.str());
}
// Prepare compute pipeline state descriptor
auto desc = MTL::ComputePipelineDescriptor::alloc()->init();
desc->setComputeFunction(mtl_function);
desc->setLinkedFunctions(linked_functions);
// Compile kernel to compute pipeline
NS::Error* error = nullptr;
auto kernel = device_->newComputePipelineState(
desc, MTL::PipelineOptionNone, nullptr, &error);
// Throw error if unable to compile metal function
if (!kernel) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load kernel " << name << "\n";
if (error) {
msg << error->localizedDescription()->utf8String() << "\n";
}
throw std::runtime_error(msg.str());
}
return kernel;
}
MTL::Library* Device::get_library(
const std::string& name,
const std::string& source,
bool cache /* = true */) {
if (cache) {
if (auto it = library_map_.find(name); it != library_map_.end()) {
return it->second;
}
}
auto mtl_lib = get_library_(source);
if (cache) {
library_map_.insert({name, mtl_lib});
}
return mtl_lib;
}
MTL::Library* Device::get_library(
const std::string& name,
const MTL::StitchedLibraryDescriptor* desc,
bool cache /* = true */) {
if (cache) {
if (auto it = library_map_.find(name); it != library_map_.end()) {
return it->second;
}
}
auto mtl_lib = get_library_(desc);
if (cache) {
library_map_.insert({name, mtl_lib});
}
return mtl_lib;
}
MTL::Function* Device::get_function(
const std::string& base_name,
MTL::Library* mtl_lib,
const std::string& specialized_name /* = "" */,
const MTLFCList& func_consts /* = {} */) {
return get_function_(base_name, specialized_name, func_consts, mtl_lib);
}
MTL::Function* Device::get_function(
const std::string& base_name,
const std::string& lib_name /* = "mlx" */,
const std::string& specialized_name /* = "" */,
const MTLFCList& func_consts /* = {} */) {
// Search for cached metal lib
MTL::Library* mtl_lib = get_library_cache_(lib_name);
return get_function(base_name, mtl_lib, specialized_name, func_consts);
}
MTL::LinkedFunctions* Device::get_linked_functions_(
const std::vector<MTL::Function*>& funcs) {
if (funcs.empty()) {
return nullptr;
}
auto lfuncs = MTL::LinkedFunctions::linkedFunctions();
std::vector<NS::Object*> objs(funcs.size());
for (int i = 0; i < funcs.size(); i++) {
objs[i] = funcs[i];
}
NS::Array* funcs_arr = NS::Array::array(objs.data(), funcs.size());
lfuncs->setPrivateFunctions(funcs_arr);
return lfuncs;
}
MTL::ComputePipelineState* Device::get_kernel(
const std::string& base_name,
MTL::Library* mtl_lib,
const std::string& hash_name /* = "" */,
const MTLFCList& func_consts /* = {} */,
const std::vector<MTL::Function*>& linked_functions /* = {} */) {
auto pool = new_scoped_memory_pool();
// Look for cached kernel
const auto& kname = hash_name.empty() ? base_name : hash_name;
if (auto it = kernel_map_.find(kname); it != kernel_map_.end()) {
return it->second;
}
// Pull kernel from library
auto mtl_function = get_function_(base_name, kname, func_consts, mtl_lib);
// Compile kernel to compute pipeline
auto mtl_linked_funcs = get_linked_functions_(linked_functions);
auto kernel = get_kernel_(kname, mtl_function, mtl_linked_funcs);
mtl_function->release();
mtl_linked_funcs->release();
// Add kernel to cache
kernel_map_.insert({kname, kernel});
return kernel;
}
MTL::ComputePipelineState* Device::get_kernel(
const std::string& base_name,
const std::string& lib_name /* = "mlx" */,
const std::string& hash_name /* = "" */,
const MTLFCList& func_consts /* = {} */,
const std::vector<MTL::Function*>& linked_functions /* = {} */) {
// Look for cached kernel
const auto& kname = hash_name.size() == 0 ? base_name : hash_name;
if (auto it = kernel_map_.find(kname); it != kernel_map_.end()) {
return it->second;
}
// Search for cached metal lib
MTL::Library* mtl_lib = get_library_cache_(lib_name);
return get_kernel(base_name, mtl_lib, kname, func_consts, linked_functions);
}
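A hedged usage sketch of the specialization-aware kernel lookup added above; the kernel name, constant index, and stream are illustrative only.

// Given a Stream s on the GPU device:
auto& d = metal::device(s.device);
bool transpose = true;
MTLFCList func_consts = {
    {&transpose, MTL::DataTypeBool, 100},  // (value, type, constant index)
};
auto* kernel = d.get_kernel(
    "gemm_kernel",     // base function name in the default "mlx" library
    "mlx",
    "gemm_kernel_tn",  // hash name used as the pipeline cache key
    func_consts);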
Device& device(mlx::core::Device) {
static Device metal_device;
return metal_device;

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-24 Apple Inc.
#pragma once
@@ -31,6 +31,9 @@ inline std::string get_colocated_mtllib_path(const std::string& lib_name) {
return mtllib_path;
}
using MTLFCList =
std::vector<std::tuple<const void*, MTL::DataType, NS::UInteger>>;
class Device {
public:
Device();
@@ -59,14 +62,71 @@ class Device {
const std::function<std::string(const std::string&)>& lib_path_func =
get_colocated_mtllib_path);
MTL::ComputePipelineState* get_kernel(
MTL::Library* get_library(
const std::string& name,
const std::string& lib_name = "mlx");
const std::string& source_string,
bool cache = true);
MTL::Library* get_library(
const std::string& name,
const MTL::StitchedLibraryDescriptor* desc,
bool cache = true);
MTL::Function* get_function(
const std::string& base_name,
MTL::Library* mtl_lib,
const std::string& specialized_name = "",
const MTLFCList& func_consts = {});
MTL::Function* get_function(
const std::string& base_name,
const std::string& lib_name = "mlx",
const std::string& specialized_name = "",
const MTLFCList& func_consts = {});
MTL::ComputePipelineState* get_kernel(
const std::string& base_name,
MTL::Library* mtl_lib,
const std::string& hash_name = "",
const MTLFCList& func_consts = {},
const std::vector<MTL::Function*>& linked_functions = {});
MTL::ComputePipelineState* get_kernel(
const std::string& base_name,
const std::string& lib_name = "mlx",
const std::string& hash_name = "",
const MTLFCList& func_consts = {},
const std::vector<MTL::Function*>& linked_functions = {});
MTL::ArgumentEncoder* argument_encoder(
const std::vector<MTL::ArgumentDescriptor*>& arg_descs) const;
private:
MTL::Library* get_library_cache_(const std::string& name);
MTL::Library* get_library_(const std::string& source_string);
MTL::Library* get_library_(const MTL::StitchedLibraryDescriptor* desc);
MTL::Function* get_function_(const std::string& name, MTL::Library* mtl_lib);
MTL::Function* get_function_(
const std::string& name,
const std::string& specialized_name,
const MTLFCList& func_consts,
MTL::Library* mtl_lib);
MTL::LinkedFunctions* get_linked_functions_(
const std::vector<MTL::Function*>& funcs);
MTL::ComputePipelineState* get_kernel_(
const std::string& name,
const MTL::Function* mtl_function);
MTL::ComputePipelineState* get_kernel_(
const std::string& name,
const MTL::Function* mtl_function,
const MTL::LinkedFunctions* linked_functions);
MTL::Device* device_;
std::unordered_map<int32_t, MTL::CommandQueue*> queue_map_;
std::unordered_map<int32_t, std::pair<int, MTL::CommandBuffer*>> buffer_map_;

View File

@@ -1,5 +1,4 @@
// Copyright © 2023 Apple Inc.
#include <algorithm>
#include <cassert>
#include <numeric>
@@ -33,6 +32,9 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
if (out.size() == 0) {
return;
}
auto& s = stream();
auto& d = metal::device(s.device);
@@ -110,14 +112,18 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
for (int i = 0; i < nidx; ++i) {
set_array_buffer(compute_encoder, arg_enc, inputs[i + 1], i);
}
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), 0, nidx + 1);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), MTL::ResourceUsageRead);
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), 0, nidx + 2);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), MTL::ResourceUsageRead);
if (idx_ndim > 0) {
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), 0, nidx + 1);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()),
MTL::ResourceUsageRead);
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), 0, nidx + 2);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()),
MTL::ResourceUsageRead);
}
*static_cast<int*>(arg_enc->constantData(nidx + 3)) = idx_ndim;
// Set all the buffers
@@ -163,6 +169,11 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
inputs[0].data_size() == 1 ? CopyType::Scalar : CopyType::General;
copy_gpu(inputs[0], out, copy_type);
// Empty update
if (inputs.back().size() == 0) {
return;
}
// Get stream
auto& s = stream();
auto& d = metal::device(s.device);
@@ -254,14 +265,18 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
for (int i = 0; i < nidx; ++i) {
set_array_buffer(compute_encoder, arg_enc, inputs[i + 1], i);
}
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), 0, nidx + 1);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), MTL::ResourceUsageRead);
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), 0, nidx + 2);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), MTL::ResourceUsageRead);
if (idx_ndim > 0) {
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), 0, nidx + 1);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()),
MTL::ResourceUsageRead);
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), 0, nidx + 2);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()),
MTL::ResourceUsageRead);
}
*static_cast<int*>(arg_enc->constantData(nidx + 3)) = idx_ndim;
compute_encoder->setBuffer(static_cast<MTL::Buffer*>(arg_buf.ptr()), 0, 0);
@@ -272,14 +287,32 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
}
set_array_buffer(compute_encoder, upd, 1);
set_array_buffer(compute_encoder, out, 2);
compute_encoder->setBytes(upd.shape().data(), upd_ndim * sizeof(int), 3);
compute_encoder->setBytes(upd.strides().data(), upd_ndim * sizeof(size_t), 4);
if (upd_ndim == 0) {
// Need placeholders so Metal doesn't complain
int shape_ = 0;
size_t stride_ = 0;
compute_encoder->setBytes(&shape_, sizeof(int), 3);
compute_encoder->setBytes(&stride_, sizeof(size_t), 4);
} else {
compute_encoder->setBytes(upd.shape().data(), upd_ndim * sizeof(int), 3);
compute_encoder->setBytes(
upd.strides().data(), upd_ndim * sizeof(size_t), 4);
}
compute_encoder->setBytes(&upd_ndim, sizeof(size_t), 5);
compute_encoder->setBytes(&upd_size, sizeof(size_t), 6);
size_t out_ndim = out.ndim();
compute_encoder->setBytes(out.shape().data(), out_ndim * sizeof(int), 7);
compute_encoder->setBytes(out.strides().data(), out_ndim * sizeof(size_t), 8);
if (out_ndim == 0) {
// Need placeholders so Metal doesn't complain
int shape_ = 0;
size_t stride_ = 0;
compute_encoder->setBytes(&shape_, sizeof(int), 7);
compute_encoder->setBytes(&stride_, sizeof(size_t), 8);
} else {
compute_encoder->setBytes(out.shape().data(), out_ndim * sizeof(int), 7);
compute_encoder->setBytes(
out.strides().data(), out_ndim * sizeof(size_t), 8);
}
compute_encoder->setBytes(&out_ndim, sizeof(size_t), 9);
compute_encoder->setBytes(axes_.data(), axes_.size() * sizeof(int), 10);

View File

@@ -1,5 +1,6 @@
set(
HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/atomic.h
${CMAKE_CURRENT_SOURCE_DIR}/bf16.h
${CMAKE_CURRENT_SOURCE_DIR}/bf16_math.h
${CMAKE_CURRENT_SOURCE_DIR}/complex.h
@@ -14,9 +15,9 @@ set(
"arange"
"arg_reduce"
"binary"
"binary_two"
"conv"
"copy"
"gemm"
"gemv"
"quantized"
"random"
@@ -28,26 +29,27 @@ set(
"indexing"
)
function(build_kernel KERNEL)
set(SRCFILE ${CMAKE_CURRENT_SOURCE_DIR}/${KERNEL}.metal)
set(HEADERS_PADDED ${HEADERS})
if(${KERNEL} STREQUAL "gemm")
set(HEADERS_PADDED ${HEADERS_PADDED} ${CMAKE_CURRENT_SOURCE_DIR}/gemm/gemm.h)
endif()
if(${KERNEL} STREQUAL "conv")
set(HEADERS_PADDED ${HEADERS_PADDED} ${CMAKE_CURRENT_SOURCE_DIR}/gemm/conv.h)
endif()
function(build_kernel_base TARGET SRCFILE DEPS)
add_custom_command(
COMMAND xcrun -sdk macosx metal -Wall -Wextra
-fno-fast-math
-c ${SRCFILE}
-I${PROJECT_SOURCE_DIR}
-o ${KERNEL}.air
DEPENDS ${SRCFILE} ${HEADERS_PADDED}
OUTPUT ${KERNEL}.air
COMMENT "Building ${KERNEL}.air"
-o ${TARGET}.air
DEPENDS ${SRCFILE} ${DEPS}
OUTPUT ${TARGET}.air
COMMENT "Building ${TARGET}.air"
VERBATIM
)
endfunction(build_kernel_base)
function(build_kernel KERNEL)
set(SRCFILE ${CMAKE_CURRENT_SOURCE_DIR}/${KERNEL}.metal)
set(HEADERS_PADDED ${HEADERS})
if(${KERNEL} STREQUAL "conv")
set(HEADERS_PADDED ${HEADERS_PADDED} ${CMAKE_CURRENT_SOURCE_DIR}/conv.h)
endif()
build_kernel_base(${KERNEL} ${SRCFILE} "${HEADERS_PADDED}")
endfunction(build_kernel)
foreach(KERNEL ${KERNELS})
@@ -55,6 +57,15 @@ foreach(KERNEL ${KERNELS})
set(KERNEL_AIR ${KERNEL}.air ${KERNEL_AIR})
endforeach()
file(GLOB_RECURSE STEEL_KERNELS ${CMAKE_CURRENT_SOURCE_DIR}/steel/*.metal)
file(GLOB_RECURSE STEEL_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/steel/*.h)
foreach(KERNEL ${STEEL_KERNELS})
cmake_path(GET KERNEL STEM TARGET)
build_kernel_base(${TARGET} ${KERNEL} "${STEEL_HEADERS}")
set(KERNEL_AIR ${TARGET}.air ${KERNEL_AIR})
endforeach()
add_custom_command(
OUTPUT ${MLX_METAL_PATH}/mlx.metallib
COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o ${MLX_METAL_PATH}/mlx.metallib

View File

@@ -63,18 +63,6 @@ struct ArgMax {
}
};
bool simd_shuffle_down(bool data, uint16_t delta) {
return simd_shuffle_down(static_cast<uint32_t>(data), delta);
}
uint64_t simd_shuffle_down(uint64_t data, uint16_t delta) {
return as_type<uint64_t>(simd_shuffle_down(as_type<uint2>(data), delta));
}
int64_t simd_shuffle_down(int64_t data, uint16_t delta) {
return as_type<int64_t>(simd_shuffle_down(as_type<uint2>(data), delta));
}
template <typename U>
IndexValPair<U> simd_shuffle_down(IndexValPair<U> data, uint16_t delta) {
return IndexValPair<U>(

View File

@@ -38,49 +38,59 @@ struct mlx_atomic<T, enable_if_t<is_metal_atomic<T>>> {
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC T
mlx_atomic_load_explicit(device mlx_atomic<T>* object, int offset) {
mlx_atomic_load_explicit(device mlx_atomic<T>* object, uint offset) {
return atomic_load_explicit(&(object[offset].val), memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, int offset) {
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, uint offset) {
atomic_store_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_and_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_and_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
atomic_fetch_and_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, int offset) {
mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, uint offset) {
atomic_fetch_or_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_min_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_min_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
atomic_fetch_min_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_max_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_max_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
atomic_fetch_max_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_add_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_add_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
atomic_fetch_add_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_mul_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_mul_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
T expected = mlx_atomic_load_explicit(object, offset);
while (!mlx_atomic_compare_exchange_weak_explicit(
object, &expected, val * expected, offset)) {
@@ -92,7 +102,7 @@ METAL_FUNC bool mlx_atomic_compare_exchange_weak_explicit(
device mlx_atomic<T>* object,
thread T* expected,
T val,
int offset) {
uint offset) {
return atomic_compare_exchange_weak_explicit(
&(object[offset].val),
expected,
@@ -106,7 +116,7 @@ template <>
METAL_FUNC void mlx_atomic_fetch_min_explicit<float>(
device mlx_atomic<float>* object,
float val,
int offset) {
uint offset) {
float expected = mlx_atomic_load_explicit(object, offset);
while (val < expected) {
if (mlx_atomic_compare_exchange_weak_explicit(
@@ -121,7 +131,7 @@ template <>
METAL_FUNC void mlx_atomic_fetch_max_explicit<float>(
device mlx_atomic<float>* object,
float val,
int offset) {
uint offset) {
float expected = mlx_atomic_load_explicit(object, offset);
while (val > expected) {
if (mlx_atomic_compare_exchange_weak_explicit(
@@ -148,7 +158,7 @@ union uint_or_packed {
template <typename T, typename Op>
struct mlx_atomic_update_helper {
uint operator()(uint_or_packed<T> init, T update, int elem_offset) {
uint operator()(uint_or_packed<T> init, T update, uint elem_offset) {
Op op;
init.val[elem_offset] = op(update, init.val[elem_offset]);
return init.bits;
@@ -159,9 +169,9 @@ template <typename T, typename Op>
METAL_FUNC void mlx_atomic_update_and_store(
device mlx_atomic<T>* object,
T update,
int offset) {
int pack_offset = offset / packing_size<T>;
int elem_offset = offset % packing_size<T>;
uint offset) {
uint pack_offset = offset / packing_size<T>;
uint elem_offset = offset % packing_size<T>;
mlx_atomic_update_helper<T, Op> helper;
uint_or_packed<T> expected;
@@ -242,9 +252,9 @@ struct __Min {
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC T
mlx_atomic_load_explicit(device mlx_atomic<T>* object, int offset) {
int pack_offset = offset / sizeof(T);
int elem_offset = offset % sizeof(T);
mlx_atomic_load_explicit(device mlx_atomic<T>* object, uint offset) {
uint pack_offset = offset / sizeof(T);
uint elem_offset = offset % sizeof(T);
uint_or_packed<T> packed_val;
packed_val.bits =
atomic_load_explicit(&(object[pack_offset].val), memory_order_relaxed);
@@ -253,15 +263,17 @@ mlx_atomic_load_explicit(device mlx_atomic<T>* object, int offset) {
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, int offset) {
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, uint offset) {
mlx_atomic_update_and_store<T, __None<T>>(object, val, offset);
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_and_explicit(device mlx_atomic<T>* object, T val, int offset) {
int pack_offset = offset / packing_size<T>;
int elem_offset = offset % packing_size<T>;
METAL_FUNC void mlx_atomic_fetch_and_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
uint pack_offset = offset / packing_size<T>;
uint elem_offset = offset % packing_size<T>;
uint_or_packed<T> identity;
identity.bits = __UINT32_MAX__;
identity.val[elem_offset] = val;
@@ -272,9 +284,9 @@ mlx_atomic_fetch_and_explicit(device mlx_atomic<T>* object, T val, int offset) {
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, int offset) {
int pack_offset = offset / packing_size<T>;
int elem_offset = offset % packing_size<T>;
mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, uint offset) {
uint pack_offset = offset / packing_size<T>;
uint elem_offset = offset % packing_size<T>;
uint_or_packed<T> identity;
identity.bits = 0;
identity.val[elem_offset] = val;
@@ -284,26 +296,34 @@ mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, int offset) {
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_min_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_min_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
mlx_atomic_update_and_store<T, __Min<T>>(object, val, offset);
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_max_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_max_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
mlx_atomic_update_and_store<T, __Max<T>>(object, val, offset);
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_add_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_add_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
mlx_atomic_update_and_store<T, __Add<T>>(object, val, offset);
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_mul_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_mul_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
mlx_atomic_update_and_store<T, __Mul<T>>(object, val, offset);
}
@@ -312,11 +332,11 @@ METAL_FUNC bool mlx_atomic_compare_exchange_weak_explicit(
device mlx_atomic<T>* object,
thread uint* expected,
uint val,
int offset) {
uint offset) {
return atomic_compare_exchange_weak_explicit(
&(object[offset].val),
expected,
val,
memory_order_relaxed,
memory_order_relaxed);
}
}
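The hunks above move the emulated (non-native) atomics to unsigned offsets: a sub-32-bit element is located by splitting its offset into the 32-bit word that holds it (pack_offset) and the lane inside that word (elem_offset), and every update goes through a compare-exchange loop on the whole word. A plain C++ sketch of that packing scheme, assuming a byte-sized payload and a max update; names such as packed_max_u8 are illustrative, not part of the kernels.

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>

// One 32-bit atomic word holds packing_size sub-word elements.
constexpr size_t packing_size = sizeof(uint32_t) / sizeof(uint8_t);

void packed_max_u8(std::atomic<uint32_t>* words, uint8_t val, size_t offset) {
  size_t pack_offset = offset / packing_size;  // which 32-bit word
  size_t elem_offset = offset % packing_size;  // which byte inside it
  uint32_t expected = words[pack_offset].load(std::memory_order_relaxed);
  while (true) {
    uint8_t lanes[4];
    std::memcpy(lanes, &expected, 4);
    if (lanes[elem_offset] >= val) return;     // already the max, nothing to store
    lanes[elem_offset] = val;                  // apply the op to one lane only
    uint32_t desired;
    std::memcpy(&desired, lanes, 4);
    if (words[pack_offset].compare_exchange_weak(
            expected, desired, std::memory_order_relaxed)) {
      return;                                  // CAS published the update
    }                                          // otherwise expected was refreshed; retry
  }
}

int main() {
  std::atomic<uint32_t> word{0};
  packed_max_u8(&word, 7, 2);                    // writes 7 into byte 2 of word 0
  packed_max_u8(&word, 3, 2);                    // 3 < 7, leaves the word unchanged
  std::cout << std::hex << word.load() << "\n";  // 70000
}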

View File

@@ -58,6 +58,9 @@ struct LessEqual {
struct LogAddExp {
template <typename T>
T operator()(T x, T y) {
if (metal::isnan(x) || metal::isnan(y)) {
return metal::numeric_limits<T>::quiet_NaN();
}
constexpr T inf = metal::numeric_limits<T>::infinity();
T maxval = metal::max(x, y);
T minval = metal::min(x, y);
@@ -67,20 +70,48 @@ struct LogAddExp {
};
struct Maximum {
template <typename T> T operator()(T x, T y) { return metal::max(x, y); }
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
return metal::max(x, y);
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
if (metal::isnan(x)) {
return x;
}
return x > y ? x : y;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
return x >= y ? x : y;
if (metal::isnan(x.real) || metal::isnan(x.imag)) {
return x;
}
return x > y ? x : y;
}
};
struct Minimum {
template <typename T> T operator()(T x, T y) { return metal::min(x, y); }
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
return metal::min(x, y);
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
if (metal::isnan(x)) {
return x;
}
return x < y ? x : y;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
return x <= y ? x : y;
if (metal::isnan(x.real) || metal::isnan(x.imag)) {
return x;
}
return x < y ? x : y;
}
};
@@ -131,6 +162,16 @@ struct Subtract {
template <typename T> T operator()(T x, T y) { return x - y; }
};
struct LogicalAnd {
template <typename T>
T operator()(T x, T y) { return x && y; };
};
struct LogicalOr {
template <typename T>
T operator()(T x, T y) { return x || y; };
};
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_s2s(
device const T* a,
@@ -377,3 +418,6 @@ instantiate_binary_all(naneq, float16, half, bool, NaNEqual)
instantiate_binary_all(naneq, float32, float, bool, NaNEqual)
instantiate_binary_all(naneq, bfloat16, bfloat16_t, bool, NaNEqual)
instantiate_binary_all(naneq, complex64, complex64_t, bool, NaNEqual)
instantiate_binary_all(lor, bool_, bool, bool, LogicalOr)
instantiate_binary_all(land, bool_, bool, bool, LogicalAnd)
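The Maximum/Minimum specializations above exist because a purely comparison-based max silently drops NaN whenever NaN sits on the compared side, since every comparison with NaN is false. A small C++ sketch of the same idea with plain floats; nan_propagating_max is an illustrative name.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>

template <typename T>
T nan_propagating_max(T x, T y) {
  if (std::isnan(x)) return x;  // NaN in x wins outright
  return x > y ? x : y;         // NaN in y also wins: x > NaN is false, so y is returned
}

int main() {
  float nan = std::numeric_limits<float>::quiet_NaN();
  std::cout << std::max(nan, 1.0f) << " " << std::max(1.0f, nan) << "\n";  // nan 1: order-dependent
  std::cout << nan_propagating_max(nan, 1.0f) << " "
            << nan_propagating_max(1.0f, nan) << "\n";                     // nan nan
}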

View File

@@ -0,0 +1,259 @@
// Copyright © 2023 Apple Inc.
#include <metal_integer>
#include <metal_math>
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/bf16.h"
struct FloorDivide {
template <typename T> T operator()(T x, T y) { return x / y; }
template <> float operator()(float x, float y) { return trunc(x / y); }
template <> half operator()(half x, half y) { return trunc(x / y); }
template <> bfloat16_t operator()(bfloat16_t x, bfloat16_t y) { return trunc(x / y); }
};
struct Remainder {
template <typename T> T operator()(T x, T y) { return x % y; }
template <> float operator()(float x, float y) { return fmod(x, y); }
template <> half operator()(half x, half y) { return fmod(x, y); }
template <> bfloat16_t operator()(bfloat16_t x, bfloat16_t y) { return fmod(x, y); }
};
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_s2s(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[0], b[0]);
d[index] = Op2()(a[0], b[0]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_ss(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[0], b[0]);
d[index] = Op2()(a[0], b[0]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_sv(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[0], b[index]);
d[index] = Op2()(a[0], b[index]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_vs(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[index], b[0]);
d[index] = Op2()(a[index], b[0]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_vv(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[index], b[index]);
d[index] = Op2()(a[index], b[index]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g_nd1(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]) {
auto a_idx = elem_to_loc_1(index, a_stride);
auto b_idx = elem_to_loc_1(index, b_stride);
c[index] = Op1()(a[a_idx], b[b_idx]);
d[index] = Op2()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g_nd2(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_2(index, a_strides);
auto b_idx = elem_to_loc_2(index, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * index.y;
c[out_idx] = Op1()(a[a_idx], b[b_idx]);
d[out_idx] = Op2()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g_nd3(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_3(index, a_strides);
auto b_idx = elem_to_loc_3(index, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op1()(a[a_idx], b[b_idx]);
d[out_idx] = Op2()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op1, typename Op2, int DIM>
[[kernel]] void binary_op_g_nd(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const int shape[DIM],
constant const size_t a_strides[DIM],
constant const size_t b_strides[DIM],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd<DIM>(index, shape, a_strides, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op1()(a[idx.x], b[idx.y]);
d[out_idx] = Op2()(a[idx.x], b[idx.y]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd(index, shape, a_strides, b_strides, ndim);
size_t out_idx = index.x + grid_dim.x * (index.y + grid_dim.y * index.z);
c[out_idx] = Op1()(a[idx.x], b[idx.y]);
d[out_idx] = Op2()(a[idx.x], b[idx.y]);
}
#define instantiate_binary(name, itype, otype, op1, op2, bopt) \
template [[host_name(name)]] \
[[kernel]] void binary_op_##bopt<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
uint index [[thread_position_in_grid]]);
#define instantiate_binary_g_dim(name, itype, otype, op1, op2, dims) \
template [[host_name(name "_" #dims)]] \
[[kernel]] void binary_op_g_nd<itype, otype, op1, op2, dims>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const int shape[dims], \
constant const size_t a_strides[dims], \
constant const size_t b_strides[dims], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_binary_g_nd(name, itype, otype, op1, op2) \
template [[host_name(name "_1")]] \
[[kernel]] void binary_op_g_nd1<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const size_t& a_stride, \
constant const size_t& b_stride, \
uint index [[thread_position_in_grid]]); \
template [[host_name(name "_2")]] \
[[kernel]] void binary_op_g_nd2<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const size_t a_strides[2], \
constant const size_t b_strides[2], \
uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]); \
template [[host_name(name "_3")]] \
[[kernel]] void binary_op_g_nd3<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const size_t a_strides[3], \
constant const size_t b_strides[3], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
instantiate_binary_g_dim(name, itype, otype, op1, op2, 4) \
instantiate_binary_g_dim(name, itype, otype, op1, op2, 5)
#define instantiate_binary_g(name, itype, otype, op1, op2) \
template [[host_name(name)]] \
[[kernel]] void binary_op_g<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const int* shape, \
constant const size_t* a_strides, \
constant const size_t* b_strides, \
constant const int& ndim, \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_binary_all(name, tname, itype, otype, op1, op2) \
instantiate_binary("ss" #name #tname, itype, otype, op1, op2, ss) \
instantiate_binary("sv" #name #tname, itype, otype, op1, op2, sv) \
instantiate_binary("vs" #name #tname, itype, otype, op1, op2, vs) \
instantiate_binary("vv" #name #tname, itype, otype, op1, op2, vv) \
instantiate_binary_g("g" #name #tname, itype, otype, op1, op2) \
instantiate_binary_g_nd("g" #name #tname, itype, otype, op1, op2)
#define instantiate_binary_float(name, op1, op2) \
instantiate_binary_all(name, float16, half, half, op1, op2) \
instantiate_binary_all(name, float32, float, float, op1, op2) \
instantiate_binary_all(name, bfloat16, bfloat16_t, bfloat16_t, op1, op2)
#define instantiate_binary_types(name, op1, op2) \
instantiate_binary_all(name, bool_, bool, bool, op1, op2) \
instantiate_binary_all(name, uint8, uint8_t, uint8_t, op1, op2) \
instantiate_binary_all(name, uint16, uint16_t, uint16_t, op1, op2) \
instantiate_binary_all(name, uint32, uint32_t, uint32_t, op1, op2) \
instantiate_binary_all(name, uint64, uint64_t, uint64_t, op1, op2) \
instantiate_binary_all(name, int8, int8_t, int8_t, op1, op2) \
instantiate_binary_all(name, int16, int16_t, int16_t, op1, op2) \
instantiate_binary_all(name, int32, int32_t, int32_t, op1, op2) \
instantiate_binary_all(name, int64, int64_t, int64_t, op1, op2) \
instantiate_binary_all(name, complex64, complex64_t, complex64_t, op1, op2) \
instantiate_binary_float(name, op1, op2)
instantiate_binary_types(divmod, FloorDivide, Remainder)
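This new file emits fused divmod kernels: each thread computes two outputs from the same pair of inputs, a truncating quotient via Op1 (FloorDivide) and a remainder via Op2 (Remainder). A host-side C++ sketch of the per-element computation for floating-point inputs; divmod_elem is an illustrative name.

#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

std::pair<float, float> divmod_elem(float x, float y) {
  return {std::trunc(x / y), std::fmod(x, y)};  // quotient and remainder in one pass
}

int main() {
  std::vector<float> a{7.5f, -7.5f}, b{2.0f, 2.0f};
  for (size_t i = 0; i < a.size(); ++i) {
    auto [q, r] = divmod_elem(a[i], b[i]);
    std::printf("%g divmod %g -> q=%g r=%g\n", a[i], b[i], q, r);  // 3, 1.5 and -3, -1.5
  }
}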

View File

@@ -5,7 +5,7 @@
#include "mlx/backend/metal/kernels/conv_params.h"
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/gemm/conv.h"
#include "mlx/backend/metal/kernels/conv.h"
using namespace metal;

View File

@@ -1,538 +0,0 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include <metal_simdgroup>
#include <metal_simdgroup_matrix>
#include <metal_stdlib>
#define MLX_MTL_CONST static constant constexpr const
using namespace metal;
///////////////////////////////////////////////////////////////////////////////
// Loading helper
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
int BROWS,
int BCOLS,
int BK,
int vec_size,
int tgp_size,
bool transpose,
bool ldK,
int tgp_padding = 0>
struct BlockLoader {
// Destination dimensions
MLX_MTL_CONST int dst_fd = transpose ? BCOLS : BROWS;
MLX_MTL_CONST int dst_ld = (transpose ? BROWS : BCOLS) + tgp_padding;
MLX_MTL_CONST int n_vecs = (transpose ? BROWS : BCOLS) / vec_size;
// Stride along block row within the block
MLX_MTL_CONST int bstride = tgp_size / n_vecs;
// Leading dimension for src
const int src_ld;
// Stride along reduction axis between blocks
const int tstride;
// Thread location indices
const short thread_idx;
const short bi;
const short bj;
// threadgroup and device memory
threadgroup T* dst;
const device T* src;
/* Constructor */
METAL_FUNC BlockLoader(
const device T* src_,
const int src_ld_,
threadgroup T* dst_,
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]])
: src_ld(src_ld_),
tstride(
BK * ((int)(transpose ^ !ldK) * src_ld + (int)(transpose ^ ldK))),
thread_idx(simd_group_id * 32 + simd_lane_id),
bi(thread_idx / n_vecs),
bj(vec_size * (thread_idx % n_vecs)),
dst(dst_ + bi * dst_ld + bj),
src(src_ + bi * src_ld + bj) {}
/* Load from device memory into threadgroup memory - without bound checking */
METAL_FUNC void load_unsafe() const {
#pragma clang loop unroll(full)
for (short i = 0; i < dst_fd; i += bstride) {
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = src[i * src_ld + j];
}
}
}
/* Load from device memory into threadgroup memory - with bound checking */
METAL_FUNC void load_safe(short2 src_tile_dim) const {
src_tile_dim = transpose ? src_tile_dim.yx : src_tile_dim.xy;
// Iterate over rows of block
#pragma clang loop unroll(full)
for (short i = 0; i < dst_fd; i += bstride) {
// Row is in bounds, we check against column
if ((bi + i) < src_tile_dim.y) {
// Use fast thread memory for bound checks
short tmp_idx[vec_size];
T tmp_val[vec_size];
// Make sure tmp_idx only contains valid indices
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
tmp_idx[j] = bj + j < src_tile_dim.x ? j : 0;
}
// Read all valid indices into tmp_val
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = src[i * src_ld + tmp_idx[j]];
}
// Zero out unneeded values
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = bj + j < src_tile_dim.x ? tmp_val[j] : T(0);
}
// Copy values to threadgroup memory
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = tmp_val[j];
}
}
// Row is out of bounds, we just fill tgp memory with zeros
else {
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = T(0);
}
}
}
}
/* Iteration helper */
METAL_FUNC void next() {
src += tstride;
}
};
///////////////////////////////////////////////////////////////////////////////
// Transforms
///////////////////////////////////////////////////////////////////////////////
template <typename OutT, typename InT>
struct TransformNone {
static METAL_FUNC OutT apply(InT x) {
return static_cast<OutT>(x);
}
};
template <typename T>
struct AccumHelper {
typedef float accum_type;
};
///////////////////////////////////////////////////////////////////////////////
// MMA helper
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_a,
bool transpose_b,
int tgp_padding_a = 0,
int tgp_padding_b = 0,
typename AccumType = typename AccumHelper<T>::accum_type,
typename Epilogue = TransformNone<T, AccumType>>
struct BlockMMA {
// Warp tile size along M
MLX_MTL_CONST int TM = BM / (WM * 8);
// Warp tile size along N
MLX_MTL_CONST int TN = BN / (WN * 8);
// Warp tile simdgroup matrix strides along M
MLX_MTL_CONST int TM_stride = 8 * WM;
// Warp tile simdgroup matrix strides along M
MLX_MTL_CONST int TN_stride = 8 * WN;
// Leading dimensions of threadgroup A, B blocks
MLX_MTL_CONST int lda_tgp = (transpose_a ? BM : BK) + tgp_padding_a;
MLX_MTL_CONST int ldb_tgp = (transpose_b ? BK : BN) + tgp_padding_b;
// Strides of A, B along reduction axis
MLX_MTL_CONST short simd_stride_a =
transpose_a ? TM_stride : TM_stride * lda_tgp;
MLX_MTL_CONST short simd_stride_b =
transpose_b ? TN_stride * ldb_tgp : TN_stride;
// Jump between elements
MLX_MTL_CONST short jump_a = transpose_a ? lda_tgp : 1;
MLX_MTL_CONST short jump_b = transpose_b ? ldb_tgp : 1;
// Offsets within threadgroup
const int tm;
const int tn;
// Simdgroup matrices
simdgroup_matrix<AccumType, 8, 8> Asimd[TM];
simdgroup_matrix<AccumType, 8, 8> Bsimd[TN];
simdgroup_matrix<AccumType, 8, 8> results[TM * TN] = {
simdgroup_matrix<AccumType, 8, 8>(0)};
short sm;
short sn;
/* Constructor */
METAL_FUNC BlockMMA(
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]])
: tm(8 * (simd_group_id / WN)), tn(8 * (simd_group_id % WN)) {
short qid = simd_lane_id / 4;
sm = (qid & 4) + (simd_lane_id / 2) % 4;
sn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;
}
/* (BM, BK) X (BK, BN) multiply accumulate function */
METAL_FUNC void mma(const threadgroup T* As, const threadgroup T* Bs) {
// Iterate over BK in blocks of 8
#pragma clang loop unroll(full)
for (short kk = 0; kk < BK; kk += 8) {
short2 offset_a =
transpose_a ? short2(tm + sm, kk + sn) : short2(kk + sn, tm + sm);
short2 offset_b =
transpose_b ? short2(kk + sm, tn + sn) : short2(tn + sn, kk + sm);
const threadgroup T* As__ = As + offset_a.y * lda_tgp + offset_a.x;
const threadgroup T* Bs__ = Bs + offset_b.y * ldb_tgp + offset_b.x;
simdgroup_barrier(mem_flags::mem_none);
// Load elements from threadgroup A as simdgroup matrices
#pragma clang loop unroll(full)
for (short i = 0; i < TM; i++) {
Asimd[i].thread_elements()[0] = static_cast<AccumType>(As__[0]);
Asimd[i].thread_elements()[1] = static_cast<AccumType>(As__[jump_a]);
As__ += simd_stride_a;
}
simdgroup_barrier(mem_flags::mem_none);
// Load elements from threadgroup B as simdgroup matrices
#pragma clang loop unroll(full)
for (short j = 0; j < TN; j++) {
Bsimd[j].thread_elements()[0] = static_cast<AccumType>(Bs__[0]);
Bsimd[j].thread_elements()[1] = static_cast<AccumType>(Bs__[jump_b]);
Bs__ += simd_stride_b;
}
simdgroup_barrier(mem_flags::mem_none);
// Multiply and accumulate into result simdgroup matrices
#pragma clang loop unroll(full)
for (short i = 0; i < TM; i++) {
#pragma clang loop unroll(full)
for (short j = 0; j < TN; j++) {
simdgroup_multiply_accumulate(
results[i * TN + j], Asimd[i], Bsimd[j], results[i * TN + j]);
}
}
}
}
/* Store results from simdgroup_matrix results into device memory */
METAL_FUNC void store_result(device T* C, const int ldc) const {
#pragma clang loop unroll(full)
for (int i = 0; i < TM; i++) {
#pragma clang loop unroll(full)
for (int j = 0; j < TN; j++) {
C[(i * TM_stride + sm + tm) * ldc + j * TN_stride + tn + sn] =
Epilogue::apply(results[i * TN + j].thread_elements()[0]);
C[(i * TM_stride + sm + tm) * ldc + j * TN_stride + tn + sn + 1] =
Epilogue::apply(results[i * TN + j].thread_elements()[1]);
}
}
}
METAL_FUNC void
store_result_safe(device T* C, const int ldc, short2 dst_tile_dims) const {
#pragma clang loop unroll(full)
for (int i = 0; i < TM; i++) {
if (tm + i * TM_stride + sm < dst_tile_dims.y) {
#pragma clang loop unroll(full)
for (int j = 0; j < TN; j++) {
if (tn + j * TN_stride + sn < dst_tile_dims.x) {
C[(tm + i * TM_stride + sm) * ldc + tn + j * TN_stride + sn] =
Epilogue::apply(results[i * TN + j].thread_elements()[0]);
}
if (tn + j * TN_stride + sn + 1 < dst_tile_dims.x) {
C[(tm + i * TM_stride + sm) * ldc + tn + j * TN_stride + sn + 1] =
Epilogue::apply(results[i * TN + j].thread_elements()[1]);
}
}
}
}
}
};
///////////////////////////////////////////////////////////////////////////////
// GEMM kernels
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_a,
bool transpose_b,
bool MN_aligned,
bool K_aligned,
typename AccumType = typename AccumHelper<T>::accum_type,
typename Epilogue = TransformNone<T, AccumType>>
struct GEMMKernel {
MLX_MTL_CONST short tgp_padding_a = 16 / sizeof(T);
MLX_MTL_CONST short tgp_padding_b = 16 / sizeof(T);
MLX_MTL_CONST short tgp_mem_size_a =
transpose_a ? BK * (BM + tgp_padding_a) : BM * (BK + tgp_padding_a);
MLX_MTL_CONST short tgp_mem_size_b =
transpose_b ? BN * (BK + tgp_padding_b) : BK * (BN + tgp_padding_b);
MLX_MTL_CONST short tgp_mem_size = tgp_mem_size_a + tgp_mem_size_b;
MLX_MTL_CONST short tgp_size = WM * WN * 32;
MLX_MTL_CONST short vec_size = (BM == 64 && BN == 64) ? 8 : 4;
using loader_a_t = BlockLoader<
T,
BM,
BK,
BK,
vec_size,
tgp_size,
transpose_a,
true,
tgp_padding_a>;
using loader_b_t = BlockLoader<
T,
BK,
BN,
BK,
vec_size,
tgp_size,
transpose_b,
false,
tgp_padding_b>;
using mma_t = BlockMMA<
T,
BM,
BN,
BK,
WM,
WN,
transpose_a,
transpose_b,
tgp_padding_a,
tgp_padding_b,
AccumType,
Epilogue>;
/* Main kernel function */
static METAL_FUNC void run(
const device T* A [[buffer(0)]],
const device T* B [[buffer(1)]],
device T* C [[buffer(2)]],
const constant int& M [[buffer(3)]],
const constant int& N [[buffer(4)]],
const constant int& K [[buffer(5)]],
const constant int& batch_stride_a [[buffer(6)]],
const constant int& batch_stride_b [[buffer(7)]],
const constant int& batch_stride_c [[buffer(8)]],
threadgroup T* tgp_memory [[threadgroup(0)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
// Pacifying compiler
(void)lid;
// Adjust for batch
A += batch_stride_a * tid.z;
B += batch_stride_b * tid.z;
C += batch_stride_c * tid.z;
// Adjust for transpose
const int lda_dev = transpose_a ? M : K;
const int ldb_dev = transpose_b ? K : N;
// Find block in A, B, C
const int c_row = tid.y * BM;
const int c_col = tid.x * BN;
A += transpose_a ? c_row : c_row * K;
B += transpose_b ? c_col * K : c_col;
C += c_row * N + c_col;
// Prepare threadgroup memory for loading
threadgroup T* As = tgp_memory;
threadgroup T* Bs = tgp_memory + tgp_mem_size_a;
// Prepare threadgroup loading operations
loader_a_t loader_a(A, lda_dev, As, simd_group_id, simd_lane_id);
loader_b_t loader_b(B, ldb_dev, Bs, simd_group_id, simd_lane_id);
// Prepare threadgroup mma operation
mma_t mma_op(simd_group_id, simd_lane_id);
///////////////////////////////////////////////////////////////////////////////
// MNK aligned loop
if (MN_aligned && K_aligned) {
for (int k = 0; k < K; k += BK) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
threadgroup_barrier(mem_flags::mem_none);
// Store results to device memory
mma_op.store_result(C, N);
return;
}
///////////////////////////////////////////////////////////////////////////////
// MN aligned, K unaligned loop
else if (MN_aligned && !K_aligned) {
// Main loop
int k = 0;
for (; k + BK <= K; k += BK) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
// Loop tail
threadgroup_barrier(mem_flags::mem_threadgroup);
loader_a.load_safe(short2(K - k, BM));
loader_b.load_safe(short2(BN, K - k));
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_op.mma(As, Bs);
// Store results to device memory
mma_op.store_result(C, N);
return;
}
///////////////////////////////////////////////////////////////////////////////
// MNK unaligned loop
else { // Loop over K - unaligned case
short2 src_tile_dims(min(BN, N - c_col), min(BM, M - c_row));
if (src_tile_dims.y == BM && src_tile_dims.x == BN) {
int k = 0;
for (; k + BK <= K; k += BK) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
threadgroup_barrier(mem_flags::mem_none);
if (k < K) {
loader_a.load_safe(short2(K - k, BM));
loader_b.load_safe(short2(BN, K - k));
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_op.mma(As, Bs);
}
mma_op.store_result(C, N);
return;
} else {
int k = 0;
for (; k + BK <= K; k += BK) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_safe(short2(BK, src_tile_dims.y));
loader_b.load_safe(short2(src_tile_dims.x, BK));
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
threadgroup_barrier(mem_flags::mem_none);
if (k < K) {
loader_a.load_safe(short2(K - k, src_tile_dims.y));
loader_b.load_safe(short2(src_tile_dims.x, K - k));
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_op.mma(As, Bs);
}
threadgroup_barrier(mem_flags::mem_none);
mma_op.store_result_safe(C, N, src_tile_dims);
return;
}
}
}
};
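The deleted header implemented the tiled GEMM: BlockLoader stages BM x BK and BK x BN tiles into threadgroup memory, BlockMMA multiplies them with 8x8 simdgroup matrices, and GEMMKernel walks the K dimension one BK block at a time with aligned and unaligned loops. A serial C++ sketch of that BM/BN/BK loop structure (no threadgroups or simdgroups; block sizes are arbitrary).

#include <algorithm>
#include <cstdio>
#include <vector>

void gemm_blocked(const std::vector<float>& A, const std::vector<float>& B,
                  std::vector<float>& C, int M, int N, int K,
                  int BM = 32, int BN = 32, int BK = 16) {
  for (int i0 = 0; i0 < M; i0 += BM)                // one "threadgroup" tile of C
    for (int j0 = 0; j0 < N; j0 += BN)
      for (int k0 = 0; k0 < K; k0 += BK)            // march along the reduction axis
        for (int i = i0; i < std::min(i0 + BM, M); ++i)
          for (int j = j0; j < std::min(j0 + BN, N); ++j) {
            float acc = 0;
            for (int k = k0; k < std::min(k0 + BK, K); ++k)
              acc += A[i * K + k] * B[k * N + j];
            C[i * N + j] += acc;                    // accumulate across K blocks
          }
}

int main() {
  int M = 33, N = 45, K = 37;                       // deliberately unaligned sizes
  std::vector<float> A(M * K, 1.0f), B(K * N, 1.0f), C(M * N, 0.0f);
  gemm_blocked(A, B, C, M, N, K);
  std::printf("%g\n", C[0]);                        // 37 = K, since A and B are all ones
}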

View File

@@ -121,8 +121,18 @@ struct GEMVKernel {
for(int tm = 0; tm < TM; tm++) {
// Load for the row
for(int tn = 0; tn < TN; tn++) {
inter[tn] = mat[tm * in_vec_size + bn + tn];
if(bn + TN <= in_vec_size) {
#pragma clang loop unroll(full)
for(int tn = 0; tn < TN; tn++) {
inter[tn] = mat[tm * in_vec_size + bn + tn];
}
} else { // Edgecase
#pragma clang loop unroll(full)
for(int tn = 0; tn < TN; tn++) {
int col_idx = (bn + tn) < in_vec_size ? (bn + tn) : (in_vec_size - 1);
inter[tn] = mat[tm * in_vec_size + col_idx];
}
}
// Accumulate results
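The GEMV change above adds an edge-case path for the last partial block of a row: when bn + TN would run past in_vec_size, the column index is clamped to the final valid element so reads stay in bounds. A C++ sketch of the clamped load, assuming the out-of-range lanes are made harmless elsewhere in the kernel; load_row_chunk is an illustrative name.

#include <algorithm>
#include <cstdio>
#include <vector>

// Read TN consecutive columns starting at bn, clamping indices to the last valid
// column so the loop never reads out of bounds.
template <int TN>
void load_row_chunk(const std::vector<float>& row, int bn, float (&inter)[TN]) {
  int in_vec_size = static_cast<int>(row.size());
  if (bn + TN <= in_vec_size) {
    for (int tn = 0; tn < TN; tn++) inter[tn] = row[bn + tn];           // fast path
  } else {
    for (int tn = 0; tn < TN; tn++) {
      int col_idx = std::min(bn + tn, in_vec_size - 1);                 // clamp
      inter[tn] = row[col_idx];
    }
  }
}

int main() {
  std::vector<float> row{1, 2, 3, 4, 5};
  float inter[4];
  load_row_chunk<4>(row, 3, inter);                                     // columns 5, 6 are clamped
  std::printf("%g %g %g %g\n", inter[0], inter[1], inter[2], inter[3]); // 4 5 5 5
}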

View File

@@ -173,8 +173,7 @@ template <typename T, typename IdxT, typename Op, int NIDX>
auto out_offset = elem_to_loc(
ind_offset, upd_shape + indices.ndim, out_strides, out_ndim);
auto upd_idx = elem_to_loc(gid, upd_shape, upd_strides, upd_ndim);
op.atomic_update(out + out_idx + out_offset, updates[upd_idx]);
op.atomic_update(out, updates[upd_idx], out_idx + out_offset);
}
#define instantiate_scatter4(name, type, ind_type, op_type, nindex) \

View File

@@ -5,9 +5,10 @@
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/gemm/gemm.h"
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
using namespace metal;
#define MLX_MTL_CONST static constant constexpr const
@@ -141,10 +142,11 @@ template <typename T, const int BM, const int BN, const int group_size, const in
// Adjust positions
const int out_vec_size_w = out_vec_size / el_per_int;
const int out_vec_size_g = out_vec_size / group_size;
int out_col = (tid.y * BN + simd_gid) * el_per_int;
int out_col_start = tid.y * (BN * el_per_int);
int out_col = out_col_start + simd_gid * el_per_int;
w += out_col / el_per_int;
scales += out_col / group_size;
biases += out_col / group_size;
scales += out_col_start / group_size;
biases += out_col_start / group_size;
x += tid.z * in_vec_size;
y += tid.z * out_vec_size + out_col;
@@ -154,23 +156,22 @@ template <typename T, const int BM, const int BN, const int group_size, const in
// Loop over in_vec in blocks of colgroup
for (int i=0; i<in_vec_size; i+=BM) {
int offset_lid = simd_lid + i;
int offset_gid = simd_gid + i;
bool thread_in_bounds = offset_lid < in_vec_size;
bool group_in_bounds = offset_gid < in_vec_size;
// Load the vec to shared memory
threadgroup_barrier(mem_flags::mem_threadgroup);
if (simd_gid == 0) {
x_block[simd_lid] = x[simd_lid + i];
x_block[simd_lid] = (thread_in_bounds) ? x[offset_lid] : 0;
}
// Load the scales and biases to shared memory
threadgroup_barrier(mem_flags::mem_threadgroup);
if (simd_gid == 0) {
#pragma clang loop unroll(full)
for (int j=0; j<groups_per_block; j++) {
scales_block[simd_lid * groups_per_block + j] = scales[(i + simd_lid) * out_vec_size_g + j];
}
#pragma clang loop unroll(full)
for (int j=0; j<groups_per_block; j++) {
biases_block[simd_lid * groups_per_block + j] = biases[(i + simd_lid) * out_vec_size_g + j];
}
if (simd_lid < groups_per_block && group_in_bounds) {
scales_block[simd_gid * groups_per_block + simd_lid] = scales[offset_gid * out_vec_size_g + simd_lid];
biases_block[simd_gid * groups_per_block + simd_lid] = biases[offset_gid * out_vec_size_g + simd_lid];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -180,7 +181,7 @@ template <typename T, const int BM, const int BN, const int group_size, const in
bias = biases_block[simd_lid * groups_per_block + (simd_gid * el_per_int) / group_size];
// Load the matrix elements
w_local = w[(i + simd_lid) * out_vec_size_w];
w_local = (thread_in_bounds) ? w[offset_lid * out_vec_size_w] : 0;
// Do all the work.
#pragma clang loop unroll(full)
@@ -206,7 +207,7 @@ template <typename T, const int BM, const int BN, const int group_size, const in
}
template <typename T, const int BM, const int BK, const int BN, const int group_size, const int bits>
template <typename T, const int BM, const int BK, const int BN, const int group_size, const int bits, const bool aligned_N>
[[kernel]] void qmm_t(
const device T* x [[buffer(0)]],
const device uint32_t* w [[buffer(1)]],
@@ -236,8 +237,9 @@ template <typename T, const int BM, const int BK, const int BN, const int group_
constexpr int w_els_per_thread = (BN * BK / el_per_int) / (SIMD_SIZE * WM * WN);
// Instantiate the appropriate BlockMMA and Loader
using mma_t = BlockMMA<T, BM, BN, BK, WM, WN, false, true>;
using loader_x_t = BlockLoader<T, BM, BK, BK, 4, WM * WN * SIMD_SIZE, false, true, 0>;
using mma_t = mlx::steel::BlockMMA<T, T, BM, BN, BK, WM, WN, false, true, BK, BK>;
using loader_x_t = mlx::steel::BlockLoader<T, BM, BK, BK, 1, WM * WN * SIMD_SIZE, 1, 4>;
threadgroup T scales_block[BN * groups_per_block];
threadgroup T biases_block[BN * groups_per_block];
@@ -257,6 +259,7 @@ template <typename T, const int BM, const int BK, const int BN, const int group_
// Make the x loader and mma operation
const short num_els = min(BM, M - y_row);
const short num_outs = min(BN, N - y_col);
loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);
mma_t mma_op(simd_gid, simd_lid);
@@ -292,21 +295,48 @@ template <typename T, const int BM, const int BK, const int BN, const int group_
// Load the w tile
{
for (int wo=0; wo<w_els_per_thread; wo++) {
int offset = lid * w_els_per_thread + wo;
int offset_row = offset / (BK / el_per_int);
int offset_col = offset % (BK / el_per_int);
const device uint32_t * w_local = w + offset_row * K_w + offset_col;
threadgroup T * Ws_local = Ws + offset_row * BK + offset_col * el_per_int;
if (!aligned_N && num_outs < BN) {
for (int wo=0; wo<w_els_per_thread; wo++) {
int offset = lid * w_els_per_thread + wo;
int offset_row = offset / (BK / el_per_int);
int offset_col = offset % (BK / el_per_int);
const device uint32_t * w_local = w + offset_row * K_w + offset_col;
threadgroup T * Ws_local = Ws + offset_row * BK + offset_col * el_per_int;
uint32_t wi = *w_local;
T scale = scales_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
T bias = biases_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
if (y_col + offset_col < N) {
uint32_t wi = *w_local;
T scale = scales_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
T bias = biases_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
#pragma clang loop unroll(full)
for (int t=0; t<el_per_int; t++) {
Ws_local[t] = scale * static_cast<T>(wi & bitmask) + bias;
wi >>= bits;
#pragma clang loop unroll(full)
for (int t=0; t<el_per_int; t++) {
Ws_local[t] = scale * static_cast<T>(wi & bitmask) + bias;
wi >>= bits;
}
} else {
#pragma clang loop unroll(full)
for (int t=0; t<el_per_int; t++) {
Ws_local[t] = 0;
}
}
}
} else {
for (int wo=0; wo<w_els_per_thread; wo++) {
int offset = lid * w_els_per_thread + wo;
int offset_row = offset / (BK / el_per_int);
int offset_col = offset % (BK / el_per_int);
const device uint32_t * w_local = w + offset_row * K_w + offset_col;
threadgroup T * Ws_local = Ws + offset_row * BK + offset_col * el_per_int;
uint32_t wi = *w_local;
T scale = scales_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
T bias = biases_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
#pragma clang loop unroll(full)
for (int t=0; t<el_per_int; t++) {
Ws_local[t] = scale * static_cast<T>(wi & bitmask) + bias;
wi >>= bits;
}
}
}
}
@@ -324,8 +354,8 @@ template <typename T, const int BM, const int BK, const int BN, const int group_
// Store results to device memory
threadgroup_barrier(mem_flags::mem_threadgroup);
if (num_els < BM) {
mma_op.store_result_safe(y, N, short2(BN, num_els));
if (num_els < BM || num_outs < BN) {
mma_op.store_result_safe(y, N, short2(num_outs, num_els));
} else {
mma_op.store_result(y, N);
}
@@ -361,8 +391,8 @@ template <typename T, const int BM, const int BK, const int BN, const int group_
constexpr int w_els_per_thread = (BK * BN / el_per_int) / (SIMD_SIZE * WM * WN);
// Instantiate the appropriate BlockMMA and Loader
using mma_t = BlockMMA<T, BM, BN, BK, WM, WN, false, false>;
using loader_x_t = BlockLoader<T, BM, BK, BK, 4, WM * WN * SIMD_SIZE, false, true, 0>;
using mma_t = mlx::steel::BlockMMA<T, T, BM, BN, BK, WM, WN, false, false, BK, BN>;
using loader_x_t = mlx::steel::BlockLoader<T, BM, BK, BK, 1, WM * WN * SIMD_SIZE, 1, 4>;
threadgroup T scales_block[BK * groups_per_block];
threadgroup T biases_block[BK * groups_per_block];
@@ -417,21 +447,48 @@ template <typename T, const int BM, const int BK, const int BN, const int group_
// Load the w tile
{
for (int wo=0; wo<w_els_per_thread; wo++) {
int offset = lid * w_els_per_thread + wo;
int offset_row = offset / (BN / el_per_int);
int offset_col = offset % (BN / el_per_int);
const device uint32_t * w_local = w + offset_row * N_w + offset_col;
threadgroup T * Ws_local = Ws + offset_row * BN + offset_col * el_per_int;
if (k + BK >= K) {
for (int wo=0; wo<w_els_per_thread; wo++) {
int offset = lid * w_els_per_thread + wo;
int offset_row = offset / (BN / el_per_int);
int offset_col = offset % (BN / el_per_int);
const device uint32_t * w_local = w + offset_row * N_w + offset_col;
threadgroup T * Ws_local = Ws + offset_row * BN + offset_col * el_per_int;
uint32_t wi = *w_local;
T scale = scales_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
T bias = biases_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
if (y_row + offset_row < K) {
uint32_t wi = *w_local;
T scale = scales_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
T bias = biases_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
#pragma clang loop unroll(full)
for (int t=0; t<el_per_int; t++) {
Ws_local[t] = scale * static_cast<T>(wi & bitmask) + bias;
wi >>= bits;
#pragma clang loop unroll(full)
for (int t=0; t<el_per_int; t++) {
Ws_local[t] = scale * static_cast<T>(wi & bitmask) + bias;
wi >>= bits;
}
} else {
#pragma clang loop unroll(full)
for (int t=0; t<el_per_int; t++) {
Ws_local[t] = 0;
}
}
}
} else {
for (int wo=0; wo<w_els_per_thread; wo++) {
int offset = lid * w_els_per_thread + wo;
int offset_row = offset / (BN / el_per_int);
int offset_col = offset % (BN / el_per_int);
const device uint32_t * w_local = w + offset_row * N_w + offset_col;
threadgroup T * Ws_local = Ws + offset_row * BN + offset_col * el_per_int;
uint32_t wi = *w_local;
T scale = scales_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
T bias = biases_block[offset_row * groups_per_block + offset_col / (group_size / el_per_int)];
#pragma clang loop unroll(full)
for (int t=0; t<el_per_int; t++) {
Ws_local[t] = scale * static_cast<T>(wi & bitmask) + bias;
wi >>= bits;
}
}
}
}
@@ -483,6 +540,9 @@ instantiate_qmv_types(128, 8)
instantiate_qmv_types( 64, 2)
instantiate_qmv_types( 64, 4)
instantiate_qmv_types( 64, 8)
instantiate_qmv_types( 32, 2)
instantiate_qmv_types( 32, 4)
instantiate_qmv_types( 32, 8)
#define instantiate_qvm(name, itype, group_size, bits) \
template [[host_name("qvm_" #name "_gs_" #group_size "_b_" #bits)]] \
@@ -510,10 +570,13 @@ instantiate_qvm_types(128, 8)
instantiate_qvm_types( 64, 2)
instantiate_qvm_types( 64, 4)
instantiate_qvm_types( 64, 8)
instantiate_qvm_types( 32, 2)
instantiate_qvm_types( 32, 4)
instantiate_qvm_types( 32, 8)
#define instantiate_qmm_t(name, itype, group_size, bits) \
template [[host_name("qmm_t_" #name "_gs_" #group_size "_b_" #bits)]] \
[[kernel]] void qmm_t<itype, 32, 64, 32, group_size, bits>( \
#define instantiate_qmm_t(name, itype, group_size, bits, aligned_N) \
template [[host_name("qmm_t_" #name "_gs_" #group_size "_b_" #bits "_alN_" #aligned_N)]] \
[[kernel]] void qmm_t<itype, 32, 64, 32, group_size, bits, aligned_N>( \
const device itype* x [[buffer(0)]], \
const device uint32_t* w [[buffer(1)]], \
const device itype* scales [[buffer(2)]], \
@@ -528,9 +591,12 @@ instantiate_qvm_types( 64, 8)
uint simd_lid [[thread_index_in_simdgroup]]);
#define instantiate_qmm_t_types(group_size, bits) \
instantiate_qmm_t(float32, float, group_size, bits) \
instantiate_qmm_t(float16, half, group_size, bits) \
instantiate_qmm_t(bfloat16, bfloat16_t, group_size, bits)
instantiate_qmm_t(float32, float, group_size, bits, false) \
instantiate_qmm_t(float16, half, group_size, bits, false) \
instantiate_qmm_t(bfloat16, bfloat16_t, group_size, bits, false) \
instantiate_qmm_t(float32, float, group_size, bits, true) \
instantiate_qmm_t(float16, half, group_size, bits, true) \
instantiate_qmm_t(bfloat16, bfloat16_t, group_size, bits, true)
instantiate_qmm_t_types(128, 2)
instantiate_qmm_t_types(128, 4)
@@ -538,6 +604,9 @@ instantiate_qmm_t_types(128, 8)
instantiate_qmm_t_types( 64, 2)
instantiate_qmm_t_types( 64, 4)
instantiate_qmm_t_types( 64, 8)
instantiate_qmm_t_types( 32, 2)
instantiate_qmm_t_types( 32, 4)
instantiate_qmm_t_types( 32, 8)
#define instantiate_qmm_n(name, itype, group_size, bits) \
template [[host_name("qmm_n_" #name "_gs_" #group_size "_b_" #bits)]] \
@@ -566,3 +635,6 @@ instantiate_qmm_n_types(128, 8)
instantiate_qmm_n_types( 64, 2)
instantiate_qmm_n_types( 64, 4)
instantiate_qmm_n_types( 64, 8)
instantiate_qmm_n_types( 32, 2)
instantiate_qmm_n_types( 32, 4)
instantiate_qmm_n_types( 32, 8)
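Throughout the quantized kernels above, one 32-bit word packs el_per_int = 32 / bits quantized values, and each is recovered as scale * (wi & bitmask) + bias before the shift exposes the next value; the new aligned_N and bounds branches only decide whether that unpacking may run unguarded. A C++ sketch of the per-word dequantization; dequantize_word is an illustrative name.

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<float> dequantize_word(uint32_t wi, int bits, float scale, float bias) {
  int el_per_int = 32 / bits;
  uint32_t bitmask = (1u << bits) - 1;
  std::vector<float> out(el_per_int);
  for (int t = 0; t < el_per_int; t++) {
    out[t] = scale * static_cast<float>(wi & bitmask) + bias;  // dequantize one value
    wi >>= bits;                                               // move to the next packed value
  }
  return out;
}

int main() {
  // Eight 4-bit values packed low bits first: 0, 1, 2, ..., 7.
  uint32_t wi = 0x76543210u;
  for (float v : dequantize_word(wi, 4, 0.5f, -1.0f)) std::printf("%g ", v);
  std::printf("\n");  // -1 -0.5 0 0.5 1 1.5 2 2.5
}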

View File

@@ -16,7 +16,7 @@ union bool4_or_uint {
struct None {
template <typename T>
void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
mlx_atomic_store_explicit(out, val, offset);
}
};
@@ -41,7 +41,7 @@ struct And {
}
}
void atomic_update(device mlx_atomic<bool>* out, bool val, int offset = 0) {
void atomic_update(device mlx_atomic<bool>* out, bool val, uint offset = 0) {
if (!val) {
mlx_atomic_store_explicit(out, val, offset);
}
@@ -68,8 +68,8 @@ struct Or {
void atomic_update(
device mlx_atomic<unsigned int>* out,
bool val,
int elem_idx,
int offset = 0) {
uint elem_idx,
uint offset = 0) {
if (val) {
bool4_or_uint update;
update.b = {false, false, false, false};
@@ -78,7 +78,7 @@ struct Or {
}
}
void atomic_update(device mlx_atomic<bool>* out, bool val, int offset = 0) {
void atomic_update(device mlx_atomic<bool>* out, bool val, uint offset = 0) {
if (val) {
mlx_atomic_store_explicit(out, val, offset);
}
@@ -105,7 +105,7 @@ struct Sum {
static constexpr constant U init = U(0);
template <typename T>
void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
mlx_atomic_fetch_add_explicit(out, val, offset);
}
@@ -125,7 +125,7 @@ struct Prod {
static constexpr constant U init = U(1);
template <typename T>
void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
mlx_atomic_fetch_mul_explicit(out, val, offset);
}
@@ -145,7 +145,7 @@ struct Min {
static constexpr constant U init = Limits<U>::max;
template <typename T>
void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
mlx_atomic_fetch_min_explicit(out, val, offset);
}
@@ -165,7 +165,7 @@ struct Max {
static constexpr constant U init = Limits<U>::min;
template <typename T>
void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
void atomic_update(device mlx_atomic<T>* out, T val, uint offset = 0) {
mlx_atomic_fetch_max_explicit(out, val, offset);
}

View File

@@ -24,11 +24,59 @@ template <typename T, typename Op>
device otype *out [[buffer(1)]], \
uint tid [[thread_position_in_grid]]);
///////////////////////////////////////////////////////////////////////////////
// All reduce
///////////////////////////////////////////////////////////////////////////////
template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
inline U per_thread_all_reduce(
const device T *in,
const device size_t& in_size,
uint gid,
uint grid_size) {
Op op;
U total_val = Op::init;
if (gid * N_READS < in_size) {
in += gid * N_READS;
int r = 0;
for(; r < (int)ceildiv(in_size, grid_size * N_READS) - 1; r++) {
U vals[N_READS] = {op.init};
for(int i = 0; i < N_READS; i++) {
vals[i] = static_cast<U>(in[i]);
}
for(int i = 0; i < N_READS; i++) {
total_val = op(vals[i], total_val);
}
in += grid_size * N_READS;
}
// Separate case for the last set as we close the reduction size
size_t curr_idx = (gid + r * (size_t)grid_size) * N_READS;
if (curr_idx < in_size) {
int max_reads = in_size - curr_idx;
T vals[N_READS];
for(int i = 0, idx = 0; i < N_READS; i++, idx++) {
idx = idx < max_reads ? idx : max_reads - 1;
vals[i] = in[idx];
}
for(int i = 0; i < N_READS; i++) {
U val = i < max_reads ? vals[i] : Op::init;
total_val = op(static_cast<U>(val), total_val);
}
}
}
return total_val;
}
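per_thread_all_reduce factors out the per-thread loading pattern shared by all_reduce and the new no-atomics variant: a thread starts at gid * N_READS, consumes N_READS values per step, strides by grid_size * N_READS, and clamps the final partial chunk so it never reads past in_size. A serial C++ sketch of that pattern with a sum; per_thread_sum is an illustrative name, and the real grid runs these in parallel.

#include <cstddef>
#include <cstdio>
#include <vector>

template <int N_READS>
float per_thread_sum(const std::vector<float>& in, size_t gid, size_t grid_size) {
  float total = 0.0f;
  size_t in_size = in.size();
  if (gid * N_READS >= in_size) return total;
  size_t base = gid * N_READS;
  // Full chunks, strided across the grid.
  while (base + N_READS <= in_size) {
    for (int i = 0; i < N_READS; i++) total += in[base + i];
    base += grid_size * N_READS;
  }
  // Tail chunk: only the in-range lanes contribute.
  if (base < in_size) {
    size_t max_reads = in_size - base;
    for (size_t i = 0; i < max_reads; i++) total += in[base + i];
  }
  return total;
}

int main() {
  std::vector<float> in(1000, 1.0f);
  float grand_total = 0.0f;
  size_t grid_size = 7;                                  // pretend 7 "threads"
  for (size_t gid = 0; gid < grid_size; gid++)
    grand_total += per_thread_sum<4>(in, gid, grid_size);
  std::printf("%g\n", grand_total);                      // 1000
}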
// NB: This kernel assumes threads_per_threadgroup is at most
// 1024. This way with a simd_size of 32, we are guaranteed to
// complete the reduction in two steps of simd-level reductions.
template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
[[kernel]] void all_reduce(
const device T *in [[buffer(0)]],
@@ -40,53 +88,18 @@ template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
uint simd_per_group [[simdgroups_per_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
// NB: this kernel assumes threads_per_threadgroup is at most
// 1024. This way with a simd_size of 32, we are guaranteed to
// complete the reduction in two steps of simd-level reductions.
Op op;
threadgroup U local_vals[simd_size];
U total_val = Op::init;
in += gid * N_READS;
int r = 0;
for(; r < (int)ceildiv(in_size, grid_size * N_READS) - 1; r++) {
U vals[N_READS] = {op.init};
for(int i = 0; i < N_READS; i++) {
vals[i] = static_cast<U>(in[i]);
}
for(int i = 0; i < N_READS; i++) {
total_val = op(vals[i], total_val);
}
in += grid_size * N_READS;
}
// Separate case for the last set as we close the reduction size
size_t curr_idx = (gid + r * (size_t)grid_size) * N_READS;
if (curr_idx < in_size) {
int max_reads = in_size - curr_idx;
T vals[N_READS];
for(int i = 0, idx = 0; i < N_READS; i++, idx++) {
idx = idx < max_reads ? idx : max_reads - 1;
vals[i] = in[idx];
}
for(int i = 0; i < N_READS; i++) {
U val = i < max_reads ? vals[i] : Op::init;
total_val = op(static_cast<U>(val), total_val);
}
}
U total_val = per_thread_all_reduce<T, U, Op, N_READS>(in, in_size, gid, grid_size);
// Reduction within simd group
total_val = op.simd_reduce(total_val);
if (simd_lane_id == 0) {
local_vals[simd_group_id] = total_val;
}
// Reduction within thread group
threadgroup_barrier(mem_flags::mem_threadgroup);
total_val = lid < simd_per_group ? local_vals[lid] : op.init;
@@ -98,6 +111,46 @@ template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
}
}
template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
[[kernel]] void all_reduce_no_atomics(
const device T *in [[buffer(0)]],
device U *out [[buffer(1)]],
const device size_t& in_size [[buffer(2)]],
uint gid [[thread_position_in_grid]],
uint lid [[thread_position_in_threadgroup]],
uint grid_size [[threads_per_grid]],
uint simd_per_group [[simdgroups_per_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint thread_group_id [[threadgroup_position_in_grid]]) {
Op op;
threadgroup U local_vals[simd_size];
U total_val = per_thread_all_reduce<T, U, Op, N_READS>(in, in_size, gid, grid_size);
// Reduction within simd group (simd_add isn't supported for uint64/int64 types)
for (uint16_t lane_offset = simd_size/2; lane_offset > 0; lane_offset /= 2) {
total_val = op(total_val, simd_shuffle_down(total_val, lane_offset));
}
// Write simd group reduction results to local memory
if (simd_lane_id == 0) {
local_vals[simd_group_id] = total_val;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Reduction of simdgroup reduction results within threadgroup.
total_val = lid < simd_per_group ? local_vals[lid] : op.init;
for (uint16_t lane_offset = simd_size/2; lane_offset > 0; lane_offset /= 2) {
total_val = op(total_val, simd_shuffle_down(total_val, lane_offset));
}
// Reduction across threadgroups
if (lid == 0) {
out[thread_group_id] = total_val;
}
}
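all_reduce_no_atomics cannot call op.simd_reduce because the simd-level sum/min/max intrinsics do not cover 64-bit integers, so it halves the number of active lanes each step with simd_shuffle_down. A C++ sketch of that log2-step tree reduction, modeling a simd group as a plain array of lane values.

#include <cstdint>
#include <cstdio>

constexpr int simd_size = 32;

int64_t simd_tree_sum(int64_t lanes[simd_size]) {
  // After each step, lane i holds lanes[i] + lanes[i + offset]; lane 0 ends up with
  // the full sum, mirroring op(total_val, simd_shuffle_down(total_val, offset)).
  for (int offset = simd_size / 2; offset > 0; offset /= 2) {
    for (int i = 0; i + offset < simd_size; i++) {
      lanes[i] += lanes[i + offset];
    }
  }
  return lanes[0];
}

int main() {
  int64_t lanes[simd_size];
  for (int i = 0; i < simd_size; i++) lanes[i] = i + 1;    // 1..32
  std::printf("%lld\n", (long long)simd_tree_sum(lanes));  // 528 = 32 * 33 / 2
}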
#define instantiate_all_reduce(name, itype, otype, op) \
template [[host_name("all_reduce_" #name)]] \
[[kernel]] void all_reduce<itype, otype, op>( \
@@ -111,11 +164,80 @@ template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
#define instantiate_all_reduce_no_atomics(name, itype, otype, op) \
template [[host_name("all_reduce_no_atomics_" #name)]] \
[[kernel]] void all_reduce_no_atomics<itype, otype, op>( \
const device itype *in [[buffer(0)]], \
device otype *out [[buffer(1)]], \
const device size_t& in_size [[buffer(2)]], \
uint gid [[thread_position_in_grid]], \
uint lid [[thread_position_in_threadgroup]], \
uint grid_size [[threads_per_grid]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]], \
uint thread_group_id [[threadgroup_position_in_grid]]);
///////////////////////////////////////////////////////////////////////////////
// Row atomics
///////////////////////////////////////////////////////////////////////////////
template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
inline U per_thread_row_reduce(
const device T *in,
const constant size_t& reduction_size,
const constant size_t& out_size,
const constant int* shape,
const constant size_t* strides,
const constant int& ndim,
uint lsize_x,
uint lid_x,
uint2 tid) {
Op op;
// Each threadgroup handles 1 reduction
// TODO: Specializing elem_to_loc would be slightly faster
int idx = tid.y * out_size + tid.x;
int extra_offset = elem_to_loc(idx, shape, strides, ndim);
in += extra_offset + lid_x * N_READS;
// The reduction is accumulated here
U total_val = Op::init;
// Loop over the reduction size within thread group
int r = 0;
for (; r < (int)ceildiv(reduction_size, N_READS*lsize_x) - 1; r++) {
T vals[N_READS];
for(int i = 0; i < N_READS; i++) {
vals[i] = in[i];
}
for(int i = 0; i < N_READS; i++) {
total_val = op(static_cast<U>(vals[i]), total_val);
}
in += lsize_x * N_READS;
}
// Separate case for the last set as we close the reduction size
size_t reduction_index = (lid_x + (size_t)lsize_x * r) * N_READS;
if(reduction_index < reduction_size) {
int max_reads = reduction_size - reduction_index;
T vals[N_READS];
for(int i = 0; i < N_READS; i++) {
int idx = min(i, max_reads - 1);
vals[i] = static_cast<U>(in[idx]);
}
for(int i = 0; i < N_READS; i++) {
T val = i < max_reads ? vals[i] : Op::init;
total_val = op(static_cast<U>(val), total_val);
}
}
return total_val;
}
template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
[[kernel]] void row_reduce_general(
const device T *in [[buffer(0)]],
@@ -133,46 +255,9 @@ template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
Op op;
// Each threadgroup handles 1 reduction
// TODO: Specializing elem_to_loc would be slightly faster
int idx = tid.y * out_size + tid.x;
int extra_offset = elem_to_loc(idx, shape, strides, ndim);
in += extra_offset + lid.x * N_READS;
// The reduction is accumulated here
U total_val = Op::init;
threadgroup U local_vals[simd_size];
// Loop over the reduction size within thread group
int r = 0;
for (; r < (int)ceildiv(reduction_size, N_READS*lsize.x) - 1; r++) {
T vals[N_READS];
for(int i = 0; i < N_READS; i++) {
vals[i] = in[i];
}
for(int i = 0; i < N_READS; i++) {
total_val = op(static_cast<U>(vals[i]), total_val);
}
in += lsize.x * N_READS;
}
// Separate case for the last set as we close the reduction size
size_t reduction_index = (lid.x + (size_t)lsize.x * r) * N_READS;
if(reduction_index < reduction_size) {
int max_reads = reduction_size - reduction_index;
T vals[N_READS];
for(int i = 0; i < N_READS; i++) {
int idx = min(i, max_reads - 1);
vals[i] = static_cast<U>(in[idx]);
}
for(int i = 0; i < N_READS; i++) {
T val = i < max_reads ? vals[i] : Op::init;
total_val = op(static_cast<U>(val), total_val);
}
}
U total_val = per_thread_row_reduce<T, U, Op, N_READS>(in, reduction_size, out_size, shape, strides, ndim, lsize.x, lid.x, tid.xy);
total_val = op.simd_reduce(total_val);
@@ -194,6 +279,53 @@ template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
}
}
template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
[[kernel]] void row_reduce_general_no_atomics(
const device T *in [[buffer(0)]],
device U *out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& out_size [[buffer(3)]],
const constant int* shape [[buffer(4)]],
const constant size_t* strides [[buffer(5)]],
const constant int& ndim [[buffer(6)]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 lsize [[threads_per_threadgroup]],
uint3 gsize [[threads_per_grid]],
uint3 tid [[threadgroup_position_in_grid]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_per_group [[simdgroups_per_threadgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
Op op;
threadgroup U local_vals[simd_size];
U total_val = per_thread_row_reduce<T, U, Op, N_READS>(in, reduction_size, out_size, shape, strides, ndim, lsize.x, lid.x, tid.xy);
// Reduction within simd group - simd_add isn't supported for int64 types
for (uint16_t i = simd_size/2; i > 0; i /= 2) {
total_val = op(total_val, simd_shuffle_down(total_val, i));
}
// Prepare next level
if (simd_lane_id == 0) {
local_vals[simd_group_id] = total_val;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Reduction within thread group
// Only needed if thread group has multiple simd groups
if(ceildiv(reduction_size, N_READS) > simd_size) {
total_val = lid.x < simd_per_group ? local_vals[lid.x] : op.init;
for (uint16_t i = simd_size/2; i > 0; i /= 2) {
total_val = op(total_val, simd_shuffle_down(total_val, i));
}
}
// Write row reduce output for threadgroup with 1st thread in thread group
if (lid.x == 0) {
out[(ceildiv(gsize.y, lsize.y) * tid.x) + tid.y] = total_val;
}
}
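Instead of atomically accumulating into out, row_reduce_general_no_atomics writes each threadgroup's partial result to its own slot (indexed from the threadgroup position), leaving a smaller array of partials for a follow-up reduction. A serial C++ sketch of that two-pass scheme; the group count and slicing here are illustrative.

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<long long> in(100);
  std::iota(in.begin(), in.end(), 1);           // 1..100
  const size_t num_groups = 8;                  // pretend 8 threadgroups
  std::vector<long long> partials(num_groups, 0);

  // Pass 1: each "threadgroup" reduces a contiguous slice into its own slot.
  size_t chunk = (in.size() + num_groups - 1) / num_groups;
  for (size_t g = 0; g < num_groups; g++) {
    size_t begin = g * chunk, end = std::min(in.size(), begin + chunk);
    for (size_t i = begin; i < end; i++) partials[g] += in[i];
  }

  // Pass 2: reduce the per-group partials (the role of a second kernel launch).
  long long total = std::accumulate(partials.begin(), partials.end(), 0LL);
  std::printf("%lld\n", total);                 // 5050
}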
#define instantiate_row_reduce_general(name, itype, otype, op) \
template [[host_name("row_reduce_general_" #name)]] \
[[kernel]] void row_reduce_general<itype, otype, op>( \
@@ -211,52 +343,59 @@ template <typename T, typename U, typename Op, int N_READS=REDUCE_N_READS>
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
#define instantiate_row_reduce_general_no_atomics(name, itype, otype, op) \
template [[host_name("row_reduce_general_no_atomics_" #name)]] \
[[kernel]] void row_reduce_general_no_atomics<itype, otype, op>( \
const device itype *in [[buffer(0)]], \
device otype *out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& out_size [[buffer(3)]], \
const constant int* shape [[buffer(4)]], \
const constant size_t* strides [[buffer(5)]], \
const constant int& ndim [[buffer(6)]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 lsize [[threads_per_threadgroup]], \
uint3 gsize [[threads_per_grid]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_per_group [[simdgroups_per_threadgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]]);
///////////////////////////////////////////////////////////////////////////////
// Column reduce
///////////////////////////////////////////////////////////////////////////////
template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
inline void _contiguous_strided_reduce(
const device T *in,
device mlx_atomic<U> *out,
threadgroup U *local_data,
uint in_idx,
uint out_idx,
uint reduction_size,
uint reduction_stride,
uint2 tid,
uint2 lid,
inline U _contiguous_strided_reduce(
const device T *in,
threadgroup U *local_data,
uint in_idx,
uint reduction_size,
uint reduction_stride,
uint2 tid,
uint2 lid,
uint2 lsize) {
Op op;
T local_vals[N_READS];
U total_val = Op::init;
uint base_offset = (tid.y * lsize.y + lid.y) * N_READS;
for(uint r = 0; r < N_READS; r++) {
uint offset = base_offset + r;
offset = offset < reduction_size ? offset : reduction_size - 1;
local_vals[r] = in[in_idx + offset * reduction_stride];
}
U total_val = Op::init;
for(uint r = 0; r < N_READS && (base_offset + r) < reduction_size; r++) {
total_val = op(static_cast<U>(total_val), local_vals[r]);
uint offset = base_offset + r;
total_val = op(static_cast<U>(total_val), in[in_idx + offset * reduction_stride]);
}
local_data[lsize.y * lid.x + lid.y] = total_val;
local_data[lsize.y * lid.x + lid.y] = total_val;
threadgroup_barrier(mem_flags::mem_threadgroup);
U val = Op::init;
if(lid.y == 0) {
U val = op.init;
// Perform reduction across columns in thread group
for(uint i = 0; i < lsize.y; i++) {
val = op(val, local_data[lsize.y * lid.x + i]);
val = op(val, local_data[lsize.y * lid.x + i]);
}
op.atomic_update(out, val, out_idx);
}
return val;
}
template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
@@ -265,13 +404,13 @@ template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
device mlx_atomic<U> *out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& reduction_stride [[buffer(3)]],
const constant size_t& out_size [[buffer(4)]],
const constant size_t& out_size [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
threadgroup U *local_data [[threadgroup(0)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 lsize [[threads_per_threadgroup]]) {
auto out_idx = tid.x * lsize.x + lid.x;
auto in_idx = elem_to_loc(
@@ -281,18 +420,66 @@ template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
ndim
);
Op op;
if(out_idx < out_size) {
_contiguous_strided_reduce<T, U, Op, N_READS>(
in,
out,
local_data,
in_idx,
out_idx,
reduction_size,
reduction_stride,
tid.xy,
lid.xy,
lsize.xy);
U val = _contiguous_strided_reduce<T, U, Op, N_READS>(
in,
local_data,
in_idx,
reduction_size,
reduction_stride,
tid.xy,
lid.xy,
lsize.xy);
// Write out reduction results generated by threadgroups working on specific output element, contiguously.
if (lid.y == 0) {
op.atomic_update(out, val, out_idx);
}
}
}
template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
[[kernel]] void col_reduce_general_no_atomics(
const device T *in [[buffer(0)]],
device U *out [[buffer(1)]],
const constant size_t& reduction_size [[buffer(2)]],
const constant size_t& reduction_stride [[buffer(3)]],
const constant size_t& out_size [[buffer(4)]],
const constant int* shape [[buffer(5)]],
const constant size_t* strides [[buffer(6)]],
const constant int& ndim [[buffer(7)]],
threadgroup U *local_data [[threadgroup(0)]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]],
uint3 gid [[thread_position_in_grid]],
uint3 lsize [[threads_per_threadgroup]],
uint3 gsize [[threads_per_grid]]) {
auto out_idx = tid.x * lsize.x + lid.x;
auto in_idx = elem_to_loc(
out_idx + tid.z * out_size,
shape,
strides,
ndim
);
if(out_idx < out_size) {
U val = _contiguous_strided_reduce<T, U, Op, N_READS>(
in,
local_data,
in_idx,
reduction_size,
reduction_stride,
tid.xy,
lid.xy,
lsize.xy);
// Write out reduction results generated by threadgroups working on specific output element, contiguously.
if (lid.y == 0) {
uint tgsize_y = ceildiv(gsize.y, lsize.y);
uint tgsize_z = ceildiv(gsize.z, lsize.z);
out[tgsize_y * tgsize_z * gid.x + tgsize_y * tid.z + tid.y] = val;
}
}
}
@@ -312,6 +499,23 @@ template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
uint3 lid [[thread_position_in_threadgroup]], \
uint3 lsize [[threads_per_threadgroup]]);
#define instantiate_col_reduce_general_no_atomics(name, itype, otype, op) \
template [[host_name("col_reduce_general_no_atomics_" #name)]] \
[[kernel]] void col_reduce_general_no_atomics<itype, otype, op>( \
const device itype *in [[buffer(0)]], \
device otype *out [[buffer(1)]], \
const constant size_t& reduction_size [[buffer(2)]], \
const constant size_t& reduction_stride [[buffer(3)]], \
const constant size_t& out_size [[buffer(4)]], \
const constant int* shape [[buffer(5)]], \
const constant size_t* strides [[buffer(6)]], \
const constant int& ndim [[buffer(7)]], \
threadgroup otype *local_data [[threadgroup(0)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]], \
uint3 gid [[thread_position_in_grid]], \
uint3 lsize [[threads_per_threadgroup]], \
uint3 gsize [[threads_per_grid]]);
///////////////////////////////////////////////////////////////////////////////
// Instantiations
@@ -322,6 +526,15 @@ template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
instantiate_row_reduce_general(name, itype, otype, op) \
instantiate_col_reduce_general(name, itype, otype, op)
#define instantiate_reduce_no_atomics(name, itype, otype, op) \
instantiate_all_reduce_no_atomics(name, itype, otype, op) \
instantiate_row_reduce_general_no_atomics(name, itype, otype, op) \
instantiate_col_reduce_general_no_atomics(name, itype, otype, op)
#define instantiate_same_reduce_no_atomics(name, tname, type, op) \
instantiate_init_reduce(name ##tname, type, op<type>) \
instantiate_reduce_no_atomics(name ##tname, type, type, op<type>)
#define instantiate_same_reduce(name, tname, type, op) \
instantiate_init_reduce(name ##tname, type, op<type>) \
instantiate_reduce(name ##tname, type, type, op<type>)
@@ -353,6 +566,9 @@ instantiate_same_reduce(sum, int32, int32_t, Sum)
instantiate_same_reduce(sum, float16, half, Sum)
instantiate_same_reduce(sum, float32, float, Sum)
instantiate_same_reduce_no_atomics(sum, int64, int64_t, Sum)
instantiate_same_reduce_no_atomics(sum, uint64, uint64_t, Sum)
instantiate_same_reduce(prod, uint8, uint8_t, Prod)
instantiate_same_reduce(prod, uint16, uint16_t, Prod)
instantiate_same_reduce(prod, uint32, uint32_t, Prod)
@@ -362,6 +578,9 @@ instantiate_same_reduce(prod, int32, int32_t, Prod)
instantiate_same_reduce(prod, float16, half, Prod)
instantiate_same_reduce(prod, float32, float, Prod)
instantiate_same_reduce_no_atomics(prod, int64, int64_t, Prod)
instantiate_same_reduce_no_atomics(prod, uint64, uint64_t, Prod)
instantiate_same_reduce(sum, bfloat16, bfloat16_t, Sum)
instantiate_same_reduce(prod, bfloat16, bfloat16_t, Prod)
@@ -381,6 +600,9 @@ instantiate_same_reduce(min_, int32, int32_t, Min)
instantiate_same_reduce(min_, float16, half, Min)
instantiate_same_reduce(min_, float32, float, Min)
instantiate_same_reduce_no_atomics(min_, int64, int64_t, Min)
instantiate_same_reduce_no_atomics(min_, uint64, uint64_t, Min)
instantiate_same_reduce(max_, uint8, uint8_t, Max)
instantiate_same_reduce(max_, uint16, uint16_t, Max)
instantiate_same_reduce(max_, uint32, uint32_t, Max)
@@ -390,5 +612,8 @@ instantiate_same_reduce(max_, int32, int32_t, Max)
instantiate_same_reduce(max_, float16, half, Max)
instantiate_same_reduce(max_, float32, float, Max)
instantiate_same_reduce_no_atomics(max_, int64, int64_t, Max)
instantiate_same_reduce_no_atomics(max_, uint64, uint64_t, Max)
instantiate_same_reduce(min_, bfloat16, bfloat16_t, Min)
instantiate_same_reduce(max_, bfloat16, bfloat16_t, Max)

View File

@@ -0,0 +1,312 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/backend/metal/kernels/steel/gemm/loader.h"
#include "mlx/backend/metal/kernels/steel/gemm/mma.h"
#include "mlx/backend/metal/kernels/steel/gemm/transforms.h"
#include "mlx/backend/metal/kernels/steel/utils.h"
using namespace metal;
///////////////////////////////////////////////////////////////////////////////
// GEMM kernel class
///////////////////////////////////////////////////////////////////////////////
namespace mlx {
namespace steel {
template <bool M_aligned, bool N_aligned, bool K_aligned>
struct LoopAlignment {};
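// Empty tag type: carries the (M, N, K) tile-alignment flags into gemm_loop as a
// compile-time argument so the masked and unmasked load paths can be selected.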
template <
typename T,
typename U,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_a,
bool transpose_b,
bool MN_aligned,
bool K_aligned,
typename AccumType = typename AccumHelper<T>::accum_type,
typename Epilogue = TransformNone<U, AccumType>>
struct GEMMKernel {
STEEL_CONST short tgp_padding_a = 16 / sizeof(T);
STEEL_CONST short tgp_padding_b = 16 / sizeof(T);
STEEL_CONST short tgp_mem_size_a =
transpose_a ? BK * (BM + tgp_padding_a) : BM * (BK + tgp_padding_a);
STEEL_CONST short tgp_mem_size_b =
transpose_b ? BN * (BK + tgp_padding_b) : BK * (BN + tgp_padding_b);
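// Note: the extra 16 / sizeof(T) elements pad each tile's leading dimension,
// presumably to stagger threadgroup memory accesses (e.g. to avoid bank conflicts).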
STEEL_CONST short tgp_mem_size = tgp_mem_size_a + tgp_mem_size_b;
STEEL_CONST short tgp_size = WM * WN * 32;
using loader_a_t = BlockLoader<
T,
transpose_a ? BK : BM,
transpose_a ? BM : BK,
transpose_a ? BM + tgp_padding_a : BK + tgp_padding_a,
!transpose_a,
tgp_size>;
using loader_b_t = BlockLoader<
T,
transpose_b ? BN : BK,
transpose_b ? BK : BN,
transpose_b ? BK + tgp_padding_b : BN + tgp_padding_b,
transpose_b,
tgp_size>;
using mma_t = BlockMMA<
T,
U,
BM,
BN,
BK,
WM,
WN,
transpose_a,
transpose_b,
transpose_a ? BM + tgp_padding_a : BK + tgp_padding_a,
transpose_b ? BK + tgp_padding_b : BN + tgp_padding_b,
AccumType,
Epilogue>;
/* Main kernel function */
template <bool M_aligned, bool N_aligned, bool K_aligned_>
static METAL_FUNC void gemm_loop(
threadgroup T* As [[threadgroup(0)]],
threadgroup T* Bs [[threadgroup(1)]],
const int gemm_k_iterations,
thread loader_a_t& loader_a,
thread loader_b_t& loader_b,
thread mma_t& mma_op,
thread const short& tgp_bm,
thread const short& tgp_bn,
thread const short& lbk,
LoopAlignment<M_aligned, N_aligned, K_aligned_> l = {}) {
// Appease the compiler
(void)l;
thread bool mask_A[loader_a_t::n_rows][loader_a_t::vec_size];
thread bool mask_B[loader_b_t::n_rows][loader_b_t::vec_size];
if (!M_aligned) {
short2 tile_dims_A =
transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm);
loader_a.set_mask(tile_dims_A, mask_A);
}
if (!N_aligned) {
short2 tile_dims_B =
transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK);
loader_b.set_mask(tile_dims_B, mask_B);
}
for (int k = 0; k < gemm_k_iterations; k++) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
if (M_aligned) {
loader_a.load_unsafe();
} else {
loader_a.load_safe(mask_A);
}
if (N_aligned) {
loader_b.load_unsafe();
} else {
loader_b.load_safe(mask_B);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
if (!K_aligned_) {
threadgroup_barrier(mem_flags::mem_threadgroup);
short2 tile_dims_A_last =
transpose_a ? short2(tgp_bm, lbk) : short2(lbk, tgp_bm);
short2 tile_dims_B_last =
transpose_b ? short2(lbk, tgp_bn) : short2(tgp_bn, lbk);
loader_a.set_mask(tile_dims_A_last, mask_A);
loader_b.set_mask(tile_dims_B_last, mask_B);
loader_a.load_safe(mask_A);
loader_b.load_safe(mask_B);
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_op.mma(As, Bs);
}
}
/* Main kernel function */
static METAL_FUNC void run(
const device T* A [[buffer(0)]],
const device T* B [[buffer(1)]],
device U* C [[buffer(2)]],
const constant GEMMParams* params [[buffer(3)]],
threadgroup T* As [[threadgroup(0)]],
threadgroup T* Bs [[threadgroup(1)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
// Pacifying compiler
(void)lid;
const int tid_y = ((tid.y) << params->swizzle_log) +
((tid.x) & ((1 << params->swizzle_log) - 1));
const int tid_x = (tid.x) >> params->swizzle_log;
if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
return;
}
threadgroup_barrier(mem_flags::mem_none);
// Find block in A, B, C
const int c_row = tid_y * BM;
const int c_col = tid_x * BN;
A += transpose_a ? c_row : c_row * params->lda;
B += transpose_b ? c_col * params->ldb : c_col;
C += c_row * params->ldc + c_col;
// Prepare threadgroup loading operations
thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);
// Prepare threadgroup mma operation
thread mma_t mma_op(simd_group_id, simd_lane_id);
int gemm_k_iterations = params->gemm_k_iterations_aligned;
///////////////////////////////////////////////////////////////////////////////
// MNK aligned loop
if (MN_aligned) {
for (int k = 0; k < gemm_k_iterations; k++) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
threadgroup_barrier(mem_flags::mem_none);
// Loop tail
if (!K_aligned) {
int lbk = params->K - params->gemm_k_iterations_aligned * BK;
short2 tile_dims_A = transpose_a ? short2(BM, lbk) : short2(lbk, BM);
short2 tile_dims_B = transpose_b ? short2(lbk, BN) : short2(BN, lbk);
thread bool mask_A[loader_a_t::n_rows][loader_a_t::vec_size];
thread bool mask_B[loader_b_t::n_rows][loader_b_t::vec_size];
loader_a.set_mask(tile_dims_A, mask_A);
loader_b.set_mask(tile_dims_B, mask_B);
loader_a.load_safe(mask_A);
loader_b.load_safe(mask_B);
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_op.mma(As, Bs);
}
// Store results to device memory
mma_op.store_result(C, params->ldc);
return;
}
///////////////////////////////////////////////////////////////////////////////
// MN unaligned loop
else { // Loop over K - unaligned case
short tgp_bm = min(BM, params->M - c_row);
short tgp_bn = min(BN, params->N - c_col);
short leftover_bk = params->K - params->gemm_k_iterations_aligned * BK;
if (tgp_bm == BM && tgp_bn == BN) {
gemm_loop<true, true, K_aligned>(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk);
mma_op.store_result(C, params->ldc);
return;
} else if (tgp_bn == BN) {
gemm_loop<false, true, K_aligned>(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk);
mma_op.store_result_safe(C, params->ldc, short2(tgp_bn, tgp_bm));
return;
} else if (tgp_bm == BM) {
gemm_loop<true, false, K_aligned>(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk);
mma_op.store_result_safe(C, params->ldc, short2(tgp_bn, tgp_bm));
return;
} else {
gemm_loop<false, false, K_aligned>(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk);
mma_op.store_result_safe(C, params->ldc, short2(tgp_bn, tgp_bm));
return;
}
}
}
};
} // namespace steel
} // namespace mlx

View File

@@ -1,9 +1,10 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/gemm/gemm.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
using namespace metal;
using namespace mlx::steel;
///////////////////////////////////////////////////////////////////////////////
// GEMM kernels
@@ -23,26 +24,26 @@ template <typename T,
const device T *A [[buffer(0)]],
const device T *B [[buffer(1)]],
device T *C [[buffer(2)]],
const constant GEMMParams* params [[buffer(3)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
using gemm_kernel = GEMMKernel<T, T, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned>;
threadgroup T As[gemm_kernel::tgp_mem_size_a];
threadgroup T Bs[gemm_kernel::tgp_mem_size_b];
// Adjust for batch
A += params->batch_stride_a * tid.z;
B += params->batch_stride_b * tid.z;
C += params->batch_stride_c * tid.z;
gemm_kernel::run(
A, B, C,
params,
As, Bs,
simd_lane_id, simd_group_id, tid, lid
);
}
@@ -52,17 +53,12 @@ template <typename T,
///////////////////////////////////////////////////////////////////////////////
#define instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned) \
template [[host_name("gemm_" #tname "_" #iname "_" #oname "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_MN_" #aname "_K_" #kname)]] \
template [[host_name("steel_gemm_" #tname "_" #iname "_" #oname "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_MN_" #aname "_K_" #kname)]] \
[[kernel]] void gemm<itype, bm, bn, bk, wm, wn, trans_a, trans_b, mn_aligned, k_aligned>( \
const device itype *A [[buffer(0)]], \
const device itype *B [[buffer(1)]], \
device itype *C [[buffer(2)]], \
const constant GEMMParams* params [[buffer(3)]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]], \
uint3 tid [[threadgroup_position_in_grid]], \
@@ -84,10 +80,10 @@ template <typename T,
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 32, 16, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 32, 32, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 32, 16, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 64, 16, 2, 2)
instantiate_gemm_shapes_helper(float16, half, float16, half);
instantiate_gemm_shapes_helper(bfloat16, bfloat16_t, bfloat16, bfloat16_t);
// TODO: Accumulation in different type
instantiate_gemm_shapes_helper(float32, float, float32, float);

View File

@@ -0,0 +1,260 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
using namespace metal;
using namespace mlx::steel;
///////////////////////////////////////////////////////////////////////////////
// GEMM kernels
///////////////////////////////////////////////////////////////////////////////
template <typename T,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_a,
bool transpose_b,
bool MN_aligned,
bool K_aligned,
typename AccumType = float,
typename Epilogue = TransformAdd<T, AccumType>>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void addmm(
const device T *A [[buffer(0)]],
const device T *B [[buffer(1)]],
const device T *C [[buffer(2)]],
device T *D [[buffer(3)]],
const constant GEMMAddMMParams* params [[buffer(4)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
// Pacifying compiler
(void)lid;
using gemm_kernel =
GEMMKernel<T, T, BM, BN, BK, WM, WN,
transpose_a, transpose_b,
MN_aligned, K_aligned,
AccumType, Epilogue>;
using loader_a_t = typename gemm_kernel::loader_a_t;
using loader_b_t = typename gemm_kernel::loader_b_t;
using mma_t = typename gemm_kernel::mma_t;
threadgroup T As[gemm_kernel::tgp_mem_size_a];
threadgroup T Bs[gemm_kernel::tgp_mem_size_b];
// Adjust for batch
A += params->batch_stride_a * tid.z;
B += params->batch_stride_b * tid.z;
C += params->batch_stride_c * tid.z;
D += params->batch_stride_d * tid.z;
const int tid_y = ((tid.y) << params->swizzle_log) +
((tid.x) & ((1 << params->swizzle_log) - 1));
const int tid_x = (tid.x) >> params->swizzle_log;
if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
return;
}
threadgroup_barrier(mem_flags::mem_none);
// Find block in A, B, C
const int c_row = tid_y * BM;
const int c_col = tid_x * BN;
A += transpose_a ? c_row : c_row * params->lda;
B += transpose_b ? c_col * params->ldb : c_col;
C += c_row * params->ldc + c_col * params->fdc;
D += c_row * params->ldd + c_col;
// Prepare threadgroup loading operations
thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);
// Prepare threadgroup mma operation
thread mma_t mma_op(simd_group_id, simd_lane_id);
int gemm_k_iterations = params->gemm_k_iterations_aligned;
const Epilogue epilogue_op(params->alpha, params->beta);
///////////////////////////////////////////////////////////////////////////////
// MNK aligned loop
if (MN_aligned) {
for (int k = 0; k < gemm_k_iterations; k++) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
threadgroup_barrier(mem_flags::mem_none);
// Loop tail
if (!K_aligned) {
int lbk = params->K - params->gemm_k_iterations_aligned * BK;
short2 tile_dims_A = transpose_a ? short2(BM, lbk) : short2(lbk, BM);
short2 tile_dims_B = transpose_b ? short2(lbk, BN) : short2(BN, lbk);
thread bool mask_A[loader_a_t::n_rows][loader_a_t::vec_size];
thread bool mask_B[loader_b_t::n_rows][loader_b_t::vec_size];
loader_a.set_mask(tile_dims_A, mask_A);
loader_b.set_mask(tile_dims_B, mask_B);
loader_a.load_safe(mask_A);
loader_b.load_safe(mask_B);
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_op.mma(As, Bs);
}
// Store results to device memory
mma_op.store_result(D, params->ldd, C, params->ldc, params->fdc, epilogue_op);
return;
}
///////////////////////////////////////////////////////////////////////////////
// MN unaligned loop
else { // Loop over K - unaligned case
short tgp_bm = min(BM, params->M - c_row);
short tgp_bn = min(BN, params->N - c_col);
short leftover_bk = params->K - params->gemm_k_iterations_aligned * BK;
if (tgp_bm == BM && tgp_bn == BN) {
gemm_kernel::gemm_loop(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk,
LoopAlignment<true, true, K_aligned>{});
mma_op.store_result(D, params->ldd, C, params->ldc, params->fdc, epilogue_op);
return;
} else if (tgp_bn == BN) {
gemm_kernel::gemm_loop(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk,
LoopAlignment<false, true, K_aligned>{});
return mma_op.store_result_safe(
D, params->ldd,
C, params->ldc, params->fdc,
short2(tgp_bn, tgp_bm),
epilogue_op);
} else if (tgp_bm == BM) {
gemm_kernel::gemm_loop(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk,
LoopAlignment<true, false, K_aligned>{});
return mma_op.store_result_safe(
D, params->ldd,
C, params->ldc, params->fdc,
short2(tgp_bn, tgp_bm),
epilogue_op);
} else {
gemm_kernel::gemm_loop(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk,
LoopAlignment<false, false, K_aligned>{});
return mma_op.store_result_safe(
D, params->ldd,
C, params->ldc, params->fdc,
short2(tgp_bn, tgp_bm),
epilogue_op);
}
}
}
///////////////////////////////////////////////////////////////////////////////
// GEMM kernel initializations
///////////////////////////////////////////////////////////////////////////////
#define instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned, ep_name, epilogue) \
template [[host_name("steel_addmm_" #tname "_" #iname "_" #oname "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_MN_" #aname "_K_" #kname "_" #ep_name)]] \
[[kernel]] void addmm<itype, bm, bn, bk, wm, wn, trans_a, trans_b, mn_aligned, k_aligned, float, epilogue<itype, float>>( \
const device itype *A [[buffer(0)]], \
const device itype *B [[buffer(1)]], \
const device itype *C [[buffer(2)]], \
device itype *D [[buffer(3)]], \
const constant GEMMAddMMParams* params [[buffer(4)]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]]);
#define instantiate_gemm_bias_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned) \
instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned, add, TransformAdd) \
instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned, axpby, TransformAxpby)
#define instantiate_gemm_aligned_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm_bias_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, taligned, true, taligned, true) \
instantiate_gemm_bias_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, taligned, true, naligned, false) \
instantiate_gemm_bias_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, naligned, false, taligned, true) \
instantiate_gemm_bias_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, naligned, false, naligned, false)
#define instantiate_gemm_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm_aligned_helper(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm_aligned_helper(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm_aligned_helper(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm_aligned_helper(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)
#define instantiate_gemm_shapes_helper(iname, itype, oname, otype) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 32, 16, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 32, 32, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 32, 16, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 64, 16, 2, 2)
instantiate_gemm_shapes_helper(float16, half, float16, half);
instantiate_gemm_shapes_helper(bfloat16, bfloat16_t, bfloat16, bfloat16_t);
instantiate_gemm_shapes_helper(float32, float, float32, float);

View File

@@ -0,0 +1,280 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
using namespace metal;
using namespace mlx::steel;
///////////////////////////////////////////////////////////////////////////////
// GEMM kernels
///////////////////////////////////////////////////////////////////////////////
template <typename T,
typename U,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_a,
bool transpose_b,
bool MN_aligned,
bool K_aligned>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void gemm_splitk(
const device T *A [[buffer(0)]],
const device T *B [[buffer(1)]],
device U *C [[buffer(2)]],
const constant GEMMSpiltKParams* params [[buffer(3)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
(void)lid;
using gemm_kernel = GEMMKernel<T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned>;
using loader_a_t = typename gemm_kernel::loader_a_t;
using loader_b_t = typename gemm_kernel::loader_b_t;
using mma_t = typename gemm_kernel::mma_t;
threadgroup T As[gemm_kernel::tgp_mem_size_a];
threadgroup T Bs[gemm_kernel::tgp_mem_size_b];
const int tid_x = tid.x;
const int tid_y = tid.y;
const int tid_z = tid.z;
if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
return;
}
// Find block in A, B, C
const int c_row = tid_y * BM;
const int c_col = tid_x * BN;
const int k_start = params->split_k_partition_size * tid_z;
A += transpose_a ? (c_row + k_start * params->lda) : (k_start + c_row * params->lda);
B += transpose_b ? (k_start + c_col * params->ldb) : (c_col + k_start * params->ldb);
C += (params->split_k_partition_stride * tid_z) + (c_row * params->ldc + c_col);
// Prepare threadgroup loading operations
thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);
// Prepare threadgroup mma operation
thread mma_t mma_op(simd_group_id, simd_lane_id);
int gemm_k_iterations = params->gemm_k_iterations_aligned;
short tgp_bm = min(BM, params->M - c_row);
short tgp_bn = min(BN, params->N - c_col);
short leftover_bk = params->K % BK;
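// Each split-K partition covers split_k_partition_size elements of K starting at
// k_start; the last partition (handled below) also sweeps any remaining K tail.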
if(MN_aligned || (tgp_bm == BM && tgp_bn == BN)) {
gemm_kernel::gemm_loop(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk,
LoopAlignment<true, true, true>{});
} else if (tgp_bn == BN) {
gemm_kernel::gemm_loop(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk,
LoopAlignment<false, true, true>{});
} else if (tgp_bm == BM) {
gemm_kernel::gemm_loop(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk,
LoopAlignment<true, false, true>{});
} else {
gemm_kernel::gemm_loop(
As,
Bs,
gemm_k_iterations,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk,
LoopAlignment<false, false, true>{});
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if ((tid_z + 1) == (params->split_k_partitions)) {
int gemm_k_iter_remaining = (params->K - (k_start + params->split_k_partition_size)) / BK;
if(!K_aligned || gemm_k_iter_remaining > 0)
gemm_kernel::gemm_loop(
As,
Bs,
gemm_k_iter_remaining,
loader_a,
loader_b,
mma_op,
tgp_bm,
tgp_bn,
leftover_bk,
LoopAlignment<false, false, K_aligned>{});
}
if(MN_aligned || (tgp_bm == BM && tgp_bn == BN)) {
mma_op.store_result(C, params->ldc);
} else {
mma_op.store_result_safe(C, params->ldc, short2(tgp_bn, tgp_bm));
}
}
///////////////////////////////////////////////////////////////////////////////
// GEMM kernel initializations
///////////////////////////////////////////////////////////////////////////////
#define instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned) \
template [[host_name("steel_gemm_splitk_" #tname "_" #iname "_" #oname "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_MN_" #aname "_K_" #kname)]] \
[[kernel]] void gemm_splitk<itype, otype, bm, bn, bk, wm, wn, trans_a, trans_b, mn_aligned, k_aligned>( \
const device itype *A [[buffer(0)]], \
const device itype *B [[buffer(1)]], \
device otype *C [[buffer(2)]], \
const constant GEMMSpiltKParams* params [[buffer(3)]], \
uint simd_lane_id [[thread_index_in_simdgroup]], \
uint simd_group_id [[simdgroup_index_in_threadgroup]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 lid [[thread_position_in_threadgroup]]);
#define instantiate_gemm_aligned_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, taligned, true, taligned, true) \
instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, taligned, true, naligned, false) \
instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, naligned, false, taligned, true) \
instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, naligned, false, naligned, false)
#define instantiate_gemm_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm_aligned_helper(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm_aligned_helper(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm_aligned_helper(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
instantiate_gemm_aligned_helper(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)
#define instantiate_gemm_shapes_helper(iname, itype, oname, otype) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 16, 16, 16, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 16, 32, 16, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 16, 16, 2, 2) \
instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 32, 16, 2, 2)
instantiate_gemm_shapes_helper(float16, half, float32, float);
instantiate_gemm_shapes_helper(bfloat16, bfloat16_t, float32, float);
instantiate_gemm_shapes_helper(float32, float, float32, float);
///////////////////////////////////////////////////////////////////////////////
// Split k accumulation kernel
///////////////////////////////////////////////////////////////////////////////
template <typename AccT,
typename OutT,
typename Epilogue = TransformNone<OutT, AccT>>
[[kernel]] void gemm_splitk_accum(
const device AccT *C_split [[buffer(0)]],
device OutT *D [[buffer(1)]],
const constant int& k_partitions [[buffer(2)]],
const constant int& partition_stride [[buffer(3)]],
const constant int& ldd [[buffer(4)]],
uint2 gid [[thread_position_in_grid]]) {
// Adjust D and C
D += gid.x + gid.y * ldd;
C_split += gid.x + gid.y * ldd;
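// Sum the partial results written by each split-K partition for this output element.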
int offset = 0;
AccT out = 0;
for(int i = 0; i < k_partitions; i++) {
out += C_split[offset];
offset += partition_stride;
}
// Write output
D[0] = Epilogue::apply(out);
}
template <typename AccT,
typename OutT,
typename Epilogue = TransformAxpby<OutT, AccT>>
[[kernel]] void gemm_splitk_accum_axpby(
const device AccT *C_split [[buffer(0)]],
device OutT *D [[buffer(1)]],
const constant int& k_partitions [[buffer(2)]],
const constant int& partition_stride [[buffer(3)]],
const constant int& ldd [[buffer(4)]],
const device OutT *C [[buffer(5)]],
const constant int& ldc [[buffer(6)]],
const constant int& fdc [[buffer(7)]],
const constant float& alpha [[buffer(8)]],
const constant float& beta [[buffer(9)]],
uint2 gid [[thread_position_in_grid]]) {
// Adjust D and C
C += gid.x * fdc + gid.y * ldc;
D += gid.x + gid.y * ldd;
C_split += gid.x + gid.y * ldd;
int offset = 0;
AccT out = 0;
for(int i = 0; i < k_partitions; i++) {
out += C_split[offset];
offset += partition_stride;
}
// Write output
Epilogue op(alpha, beta);
D[0] = op.apply(out, *C);
}
#define instantiate_accum(oname, otype, aname, atype) \
template [[host_name("steel_gemm_splitk_accum_" #oname "_" #aname)]] \
[[kernel]] void gemm_splitk_accum<atype, otype>( \
const device atype *C_split [[buffer(0)]], \
device otype *D [[buffer(1)]], \
const constant int& k_partitions [[buffer(2)]], \
const constant int& partition_stride [[buffer(3)]], \
const constant int& ldd [[buffer(4)]], \
uint2 gid [[thread_position_in_grid]]); \
template [[host_name("steel_gemm_splitk_accum_" #oname "_" #aname "_axpby")]] \
[[kernel]] void gemm_splitk_accum_axpby<atype, otype>( \
const device atype *C_split [[buffer(0)]], \
device otype *D [[buffer(1)]], \
const constant int& k_partitions [[buffer(2)]], \
const constant int& partition_stride [[buffer(3)]], \
const constant int& ldd [[buffer(4)]], \
const device otype *C [[buffer(5)]], \
const constant int& ldc [[buffer(6)]], \
const constant int& fdc [[buffer(7)]], \
const constant float& alpha [[buffer(8)]], \
const constant float& beta [[buffer(9)]], \
uint2 gid [[thread_position_in_grid]]);
instantiate_accum(bfloat16, bfloat16_t, float32, float);
instantiate_accum(float16, half, float32, float);
instantiate_accum(float32, float, float32, float);

View File

@@ -0,0 +1,160 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/backend/metal/kernels/steel/utils.h"
///////////////////////////////////////////////////////////////////////////////
// Loading helper
///////////////////////////////////////////////////////////////////////////////
namespace mlx {
namespace steel {
template <
typename T,
short BROWS,
short BCOLS,
short dst_ld,
short reduction_dim,
short tgp_size,
short alignment = 1,
short n_reads = (BCOLS * BROWS) / (tgp_size),
short TCOLS = BCOLS / n_reads,
short TROWS = tgp_size / TCOLS>
struct BlockLoader {
STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
STEEL_CONST short vec_size = n_reads;
// Leading dimension for src
const int src_ld;
const int tile_stride;
// Thread location indices
const short thread_idx;
const short bi;
const short bj;
// threadgroup and device memory
threadgroup T* dst;
const device T* src;
struct alignas(alignment * sizeof(T)) ReadVector {
uint8_t v[sizeof(T) * vec_size];
};
/* Constructor */
METAL_FUNC BlockLoader(
const device T* src_,
const int src_ld_,
threadgroup T* dst_,
ushort simd_group_id [[simdgroup_index_in_threadgroup]],
ushort simd_lane_id [[thread_index_in_simdgroup]])
: src_ld(src_ld_),
tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
thread_idx(simd_group_id * 32 + simd_lane_id),
bi(thread_idx / TCOLS),
bj(vec_size * (thread_idx % TCOLS)),
dst(dst_ + bi * dst_ld + bj),
src(src_ + bi * src_ld + bj) {}
/* Load from device memory into threadgroup memory - without bound checking */
METAL_FUNC void load_unsafe() const {
STEEL_PRAGMA_UNROLL
for (short i = 0; i < BROWS; i += TROWS) {
*((threadgroup ReadVector*)(&dst[i * dst_ld])) =
*((const device ReadVector*)(&src[i * src_ld]));
}
}
/* Compute the bound-checking mask for a tile of the given dimensions */
METAL_FUNC void set_mask(
thread const short2& src_tile_dims,
thread bool mask[n_rows][vec_size]) {
STEEL_PRAGMA_UNROLL
for (short i = 0; i < n_rows; i++) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
mask[i][j] =
((bi + i) < src_tile_dims.y) && ((bj + j) < src_tile_dims.x);
}
}
}
/* Load from device memory into threadgroup memory - with bound checking */
METAL_FUNC void load_safe(short2 src_tile_dim) const {
src_tile_dim = src_tile_dim - short2(bj, bi);
// Use fast thread memory for bound checks
bool tmp_idx[vec_size];
T tmp_val[vec_size];
STEEL_PRAGMA_UNROLL
for (short i = 0; i < BROWS; i += TROWS) {
// Make sure tmp_idx only contains valid indices
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
tmp_idx[j] = (i < src_tile_dim.y) && (j < src_tile_dim.x);
}
// Read valid indices into tmp_val
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = src[(tmp_idx[j] ? i * src_ld + j : 0)];
}
// Zero out unneeded values
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);
}
// Copy values to threadgroup memory
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = tmp_val[j];
}
}
}
/* Load from device memory into threadgroup memory - with bound checking */
METAL_FUNC void load_safe(const thread bool mask[n_rows][vec_size]) const {
T tmp_val[vec_size];
STEEL_PRAGMA_UNROLL
for (short i = 0, ii = 0; i < BROWS; i += TROWS, ii++) {
simdgroup_barrier(mem_flags::mem_none);
// Use fast thread memory for bound checks
// Read valid indices into tmp_val
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = src[(mask[ii][j] ? i * src_ld + j : 0)];
}
simdgroup_barrier(mem_flags::mem_none);
// Zero out unneeded values
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = mask[ii][j] ? tmp_val[j] : T(0);
}
simdgroup_barrier(mem_flags::mem_none);
// Copy values to threadgroup memory
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = tmp_val[j];
}
}
}
/* Iteration helper */
METAL_FUNC void next() {
src += tile_stride;
}
};
} // namespace steel
} // namespace mlx

View File

@@ -0,0 +1,264 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/backend/metal/kernels/steel/gemm/transforms.h"
#include "mlx/backend/metal/kernels/steel/utils.h"
///////////////////////////////////////////////////////////////////////////////
// MMA helper
///////////////////////////////////////////////////////////////////////////////
namespace mlx {
namespace steel {
template <
typename T,
typename U,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_a,
bool transpose_b,
short lda_tgp,
short ldb_tgp,
typename AccumType = float,
typename Epilogue = TransformNone<U, AccumType>>
struct BlockMMA {
// Warp tile simdgroup matrix strides along M
STEEL_CONST short TM_stride = 8 * WM;
// Warp tile simdgroup matrix strides along N
STEEL_CONST short TN_stride = 8 * WN;
// Warp tile size along M
STEEL_CONST short TM = BM / TM_stride;
// Warp tile size along N
STEEL_CONST short TN = BN / TN_stride;
// Strides of A, B along reduction axis
STEEL_CONST short simd_stride_a = {
transpose_a ? TM_stride : TM_stride * lda_tgp};
STEEL_CONST short simd_stride_b = {
transpose_b ? TN_stride * ldb_tgp : TN_stride};
// Jump between elements
STEEL_CONST short jump_a = {transpose_a ? lda_tgp : 1};
STEEL_CONST short jump_b = {transpose_b ? ldb_tgp : 1};
STEEL_CONST short tile_stride_a = {transpose_a ? 8 * lda_tgp : 8};
STEEL_CONST short tile_stride_b = {transpose_b ? 8 : 8 * ldb_tgp};
// Simdgroup matrices
simdgroup_matrix<AccumType, 8, 8> Asimd[TM];
simdgroup_matrix<AccumType, 8, 8> Bsimd[TN];
simdgroup_matrix<AccumType, 8, 8> results[TM * TN] = {
simdgroup_matrix<AccumType, 8, 8>(0)};
// Offsets within threadgroup
const short tm;
const short tn;
short sm;
short sn;
short As_offset;
short Bs_offset;
/* Constructor */
METAL_FUNC BlockMMA(
ushort simd_group_id [[simdgroup_index_in_threadgroup]],
ushort simd_lane_id [[thread_index_in_simdgroup]])
: tm(8 * (simd_group_id / WN)), tn(8 * (simd_group_id % WN)) {
// Determine thread position in simdgroup matrix
short qid = simd_lane_id / 4;
sm = (qid & 4) + (simd_lane_id / 2) % 4;
sn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;
// Determine thread and simdgroup offset
As_offset =
transpose_a ? ((sn)*lda_tgp + (tm + sm)) : ((sn) + (tm + sm) * lda_tgp);
Bs_offset =
transpose_b ? ((tn + sn) * ldb_tgp + (sm)) : ((sm)*ldb_tgp + (tn + sn));
}
/* (BM, BK) X (BK, BN) multiply accumulate function */
METAL_FUNC void mma(const threadgroup T* As, const threadgroup T* Bs) {
// Adjust for simdgroup and thread location
As += As_offset;
Bs += Bs_offset;
// Iterate over BK in blocks of 8
STEEL_PRAGMA_UNROLL
for (short kk = 0; kk < BK; kk += 8) {
simdgroup_barrier(mem_flags::mem_none);
// Load elements from threadgroup A as simdgroup matrices
STEEL_PRAGMA_UNROLL
for (short i = 0; i < TM; i++) {
Asimd[i].thread_elements()[0] =
static_cast<AccumType>(As[i * simd_stride_a + 0]);
Asimd[i].thread_elements()[1] =
static_cast<AccumType>(As[i * simd_stride_a + jump_a]);
}
simdgroup_barrier(mem_flags::mem_none);
// Load elements from threadgroup B as simdgroup matrices
STEEL_PRAGMA_UNROLL
for (short j = 0; j < TN; j++) {
Bsimd[j].thread_elements()[0] =
static_cast<AccumType>(Bs[j * simd_stride_b + 0]);
Bsimd[j].thread_elements()[1] =
static_cast<AccumType>(Bs[j * simd_stride_b + jump_b]);
}
simdgroup_barrier(mem_flags::mem_none);
// Multiply and accumulate into result simdgroup matrices
STEEL_PRAGMA_UNROLL
for (short i = 0; i < TM; i++) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < TN; j++) {
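// Serpentine traversal over j: alternate direction on each row so consecutive
// iterations touch the most recently used B fragment (likely for register reuse).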
short j_serp = (i % 2) ? (TN - 1 - j) : j;
simdgroup_multiply_accumulate(
results[i * TN + j_serp],
Asimd[i],
Bsimd[j_serp],
results[i * TN + j_serp]);
}
}
// Progress to next simdgroup tile
As += tile_stride_a;
Bs += tile_stride_b;
}
}
/* Store results from simdgroup_matrix results into device memory */
METAL_FUNC void store_result(device U* C, const int ldc) const {
// Adjust for simdgroup and thread location
C += (sm + tm) * ldc + tn + sn;
// Loop over all simdgroup tiles
STEEL_PRAGMA_UNROLL
for (short i = 0; i < TM; i++) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread const auto& accum = results[i * TN + j].thread_elements();
int offset = (i * TM_stride) * ldc + (j * TN_stride);
// Apply epilogue
U outs[2] = {Epilogue::apply(accum[0]), Epilogue::apply(accum[1])};
// Write out C
C[offset] = outs[0];
C[offset + 1] = outs[1];
}
}
}
METAL_FUNC void
store_result_safe(device U* C, const int ldc, short2 dst_tile_dims) const {
// Adjust for simdgroup and thread location
C += (sm + tm) * ldc + (tn + sn);
dst_tile_dims -= short2(tn + sn, sm + tm);
STEEL_PRAGMA_UNROLL
for (int i = 0; i < TM; i++) {
if (i * TM_stride < dst_tile_dims.y) {
STEEL_PRAGMA_UNROLL
for (int j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread const auto& accum = results[i * TN + j].thread_elements();
int offset = (i * TM_stride) * ldc + (j * TN_stride);
// Apply epilogue and output C
if (j * TN_stride < dst_tile_dims.x) {
C[offset] = Epilogue::apply(accum[0]);
}
if (j * TN_stride + 1 < dst_tile_dims.x) {
C[offset + 1] = Epilogue::apply(accum[1]);
}
}
}
}
}
/* Store results from simdgroup_matrix results into device memory */
METAL_FUNC void store_result(
device U* D,
const int ldd,
const device U* C,
const int ldc,
const int fdc,
thread const Epilogue& epilogue_op) const {
// Adjust for simdgroup and thread location
C += (sm + tm) * ldc + (tn + sn) * fdc;
D += (sm + tm) * ldd + tn + sn;
// Loop over all simdgroup tiles
STEEL_PRAGMA_UNROLL
for (short i = 0; i < TM; i++) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread const auto& accum = results[i * TN + j].thread_elements();
int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
int offset_d = (i * TM_stride) * ldd + (j * TN_stride);
// Apply epilogue
U outs[2] = {
epilogue_op.apply(accum[0], C[offset_c]),
epilogue_op.apply(accum[1], C[offset_c + fdc])};
// Write out D
D[offset_d] = outs[0];
D[offset_d + 1] = outs[1];
}
}
}
METAL_FUNC void store_result_safe(
device U* D,
const int ldd,
const device U* C,
const int ldc,
const int fdc,
short2 dst_tile_dims,
thread const Epilogue& epilogue_op) const {
// Adjust for simdgroup and thread location
C += (sm + tm) * ldc + (tn + sn) * fdc;
D += (sm + tm) * ldd + tn + sn;
dst_tile_dims -= short2(tn + sn, sm + tm);
STEEL_PRAGMA_UNROLL
for (int i = 0; i < TM; i++) {
if (i * TM_stride < dst_tile_dims.y) {
STEEL_PRAGMA_UNROLL
for (int j = 0; j < TN; j++) {
// Get accumulated result and associated offset in C
thread const auto& accum = results[i * TN + j].thread_elements();
int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
int offset_d = (i * TM_stride) * ldd + (j * TN_stride);
// Apply epilogue and output C
if (j * TN_stride < dst_tile_dims.x) {
D[offset_d] = epilogue_op.apply(accum[0], C[offset_c]);
}
if (j * TN_stride + 1 < dst_tile_dims.x) {
D[offset_d + 1] = epilogue_op.apply(accum[1], C[offset_c + fdc]);
}
}
}
}
}
};
} // namespace steel
} // namespace mlx

View File

@@ -0,0 +1,79 @@
// Copyright © 2024 Apple Inc.
#pragma once
///////////////////////////////////////////////////////////////////////////////
// GEMM param classes
///////////////////////////////////////////////////////////////////////////////
namespace mlx {
namespace steel {
struct GEMMParams {
const int M;
const int N;
const int K;
const int lda;
const int ldb;
const int ldc;
const int tiles_n;
const int tiles_m;
const int batch_stride_a;
const int batch_stride_b;
const int batch_stride_c;
const int swizzle_log;
const int gemm_k_iterations_aligned;
};
struct GEMMSpiltKParams {
const int M;
const int N;
const int K;
const int lda;
const int ldb;
const int ldc;
const int tiles_n;
const int tiles_m;
const int split_k_partitions;
const int split_k_partition_stride;
const int split_k_partition_size;
const int gemm_k_iterations_aligned;
};
struct GEMMAddMMParams {
const int M;
const int N;
const int K;
const int lda;
const int ldb;
const int ldc;
const int ldd;
const int tiles_n;
const int tiles_m;
const int batch_stride_a;
const int batch_stride_b;
const int batch_stride_c;
const int batch_stride_d;
const int swizzle_log;
const int gemm_k_iterations_aligned;
const float alpha;
const float beta;
const int fdc;
};
} // namespace steel
} // namespace mlx
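A minimal host-side sketch of how these fields might be filled for a single, non-batched, row-major GEMM (C = A·B with M = N = 512, K = 256 and 32×32×16 tiles); the values are illustrative only and simply mirror the setup that steel_matmul performs further down in matmul.cpp:
// Illustrative only: one plausible GEMMParams setup (tile sizes bm = bn = 32, bk = 16 assumed).
mlx::steel::GEMMParams params{
    /* M */ 512,
    /* N */ 512,
    /* K */ 256,
    /* lda */ 256,  // row stride of non-transposed A (= K)
    /* ldb */ 512,  // row stride of non-transposed B (= N)
    /* ldc */ 512,  // row stride of C (= N)
    /* tiles_n */ (512 + 31) / 32,
    /* tiles_m */ (512 + 31) / 32,
    /* batch_stride_a */ 0,  // single matrix, no batching
    /* batch_stride_b */ 0,
    /* batch_stride_c */ 0,
    /* swizzle_log */ 0,
    /* gemm_k_iterations_aligned */ 256 / 16};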

View File

@@ -0,0 +1,63 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/backend/metal/kernels/steel/utils.h"
///////////////////////////////////////////////////////////////////////////////
// Transforms and Epilogues
///////////////////////////////////////////////////////////////////////////////
namespace mlx {
namespace steel {
template <typename OutT, typename InT>
struct TransformNone {
static METAL_FUNC OutT apply(InT x) {
return static_cast<OutT>(x);
}
static METAL_FUNC OutT apply(InT x, OutT) {
return static_cast<OutT>(x);
}
};
template <typename OutT, typename InT>
struct TransformAdd {
TransformAdd(const float, const float) {}
static METAL_FUNC OutT apply(InT x, OutT c) {
return static_cast<OutT>(x) + c;
}
};
template <typename OutT, typename InT>
struct TransformAxpby {
const float alpha;
const float beta;
TransformAxpby(const float alpha_, const float beta_)
: alpha(alpha_), beta(beta_) {}
METAL_FUNC OutT apply(InT x, OutT c) const {
return static_cast<OutT>(x * alpha + (beta * c));
}
};
template <typename T>
struct AccumHelper {
typedef float accum_type;
};
struct BlockSwizzle {
static METAL_FUNC int2
swizzle(uint3 tid [[threadgroup_position_in_grid]], const int swizzle_log) {
const int tid_x = (tid.x) >> swizzle_log;
const int tid_y =
((tid.y) << swizzle_log) + ((tid.x) & ((1 << swizzle_log) - 1));
return int2(tid_x, tid_y);
}
};
} // namespace steel
} // namespace mlx
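For quick reference, a small host-side C++ sketch of what the epilogues above compute per output element (names here are illustrative, not part of the kernels):
#include <cassert>
// Host-side reference of the per-element epilogues above; accum is the GEMM
// accumulator value and c the corresponding element of the extra input C.
float apply_none(float accum) { return accum; }              // TransformNone
float apply_add(float accum, float c) { return accum + c; }  // TransformAdd
float apply_axpby(float accum, float c, float alpha, float beta) {
  return accum * alpha + beta * c;                           // TransformAxpby
}
int main() {
  assert(apply_axpby(2.0f, 3.0f, 1.5f, 0.5f) == 2.0f * 1.5f + 0.5f * 3.0f);
  return 0;
}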

View File

@@ -0,0 +1,5 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/backend/metal/kernels/steel/gemm/params.h"

View File

@@ -0,0 +1,9 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include <metal_stdlib>
#include "mlx/backend/metal/kernels/steel/host.h"
#define STEEL_CONST static constant constexpr const
#define STEEL_PRAGMA_UNROLL _Pragma("clang loop unroll(full)")

View File

@@ -134,8 +134,8 @@ struct Negative {
};
struct Round {
template <typename T> T operator()(T x) { return metal::rint(x); };
template <> complex64_t operator()(complex64_t x) { return {metal::rint(x.real), metal::rint(x.imag)}; };
};
struct Sigmoid {

View File

@@ -235,12 +235,42 @@ inline size_t ceildiv(size_t N, size_t M) {
// https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#1202
inline float log1p(float x) {
float xp1 = 1.0f + x;
if (xp1 == Limits<float>::max) {
return Limits<float>::max;
}
if (xp1 == 1.0f) {
return x;
}
return x * (metal::log(xp1) / (xp1 - 1.0f));
}
inline bfloat16_t log1p(bfloat16_t x) {
float xp1 = 1.0f + static_cast<float>(x);
if (xp1 == Limits<float>::max) {
return Limits<bfloat16_t>::max;
}
if (xp1 == 1.0f) {
return x;
}
return bfloat16_t(x * (metal::log(xp1) / (xp1 - 1.0f)));
}
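// Note: this is the classic log1p correction from the Goldberg reference cited above.
// When xp1 = 1 + x differs from 1, log(1 + x) is recovered as x * log(xp1) / (xp1 - 1),
// which largely cancels the rounding error introduced when forming xp1; when xp1 == 1,
// x itself is the best available estimate. The Limits<...>::max checks guard the case
// where xp1 saturates (e.g. infinite x) by returning the limit directly.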
///////////////////////////////////////////////////////////////////////////////
// SIMD shuffle ops
///////////////////////////////////////////////////////////////////////////////
inline uint64_t simd_shuffle_down(uint64_t data, uint16_t delta) {
return as_type<uint64_t>(
metal::simd_shuffle_down(as_type<uint2>(data), delta));
}
inline int64_t simd_shuffle_down(int64_t data, uint16_t delta) {
return as_type<int64_t>(
metal::simd_shuffle_down(as_type<uint2>(data), delta));
}
inline bool simd_shuffle_down(bool data, uint16_t delta) {
return simd_shuffle_down(static_cast<uint32_t>(data), delta);
}

View File

@@ -1,4 +1,4 @@
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cassert>
@@ -8,6 +8,7 @@
#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/steel/host.h"
#include "mlx/backend/metal/matmul.h"
#include "mlx/backend/metal/mps/gemm.h"
#include "mlx/backend/metal/utils.h"
@@ -16,6 +17,10 @@
namespace mlx::core {
///////////////////////////////////////////////////////////////////////////////
// MPS Matmul fallback
///////////////////////////////////////////////////////////////////////////////
namespace {
bool use_mps() {
@@ -46,7 +51,9 @@ inline void mps_matmul(
int ldb,
bool transpose_a,
bool transpose_b,
std::vector<array>& copies,
float alpha = 1.0f,
float beta = 0.0f) {
MPS::DataType mps_dtype = MPS::DataTypeFloat32;
if (out.dtype() == float16) {
@@ -121,7 +128,7 @@ inline void mps_matmul(
auto out_mat = MPS::Matrix::alloc()->init(out_buf, out_desc);
auto kernel = MPS::MatrixMultiplication::alloc()->init(
d.mtl_device(), transpose_a, transpose_b, M, N, K, alpha, beta);
auto command_buffer = d.get_command_buffer(s.index);
kernel->setBatchSize(batch_size_out);
@@ -162,7 +169,7 @@ inline void mps_matmul(
auto out_mat = MPS::Matrix::alloc()->init(out_buf, out_desc);
auto kernel = MPS::MatrixMultiplication::alloc()->init(
d.mtl_device(), transpose_a, transpose_b, M, N, K, alpha, beta);
auto command_buffer = d.get_command_buffer(s.index);
for (int i = 0; i < batch_size_out; ++i) {
@@ -186,7 +193,11 @@ inline void mps_matmul(
} // namespace
///////////////////////////////////////////////////////////////////////////////
// Steel matmul fallback
///////////////////////////////////////////////////////////////////////////////
void steel_matmul(
const Stream& s,
metal::Device& d,
const array& a,
@@ -201,6 +212,15 @@ void mlx_matmul(
bool transpose_a,
bool transpose_b,
std::vector<array>& copies) {
using namespace mlx::steel;
// Coalesce (B, M, K) X (K, N) to (B*M, K) X (K, N)
if (batch_size_out > 1 && !transpose_a &&
a.data_size() == batch_size_out * M * K && b.size() == K * N) {
M = M * batch_size_out;
batch_size_out = 1;
}
// Account for batch sizes and basic broadcasting
int batch_size_a = a.data_size() / (M * K);
int batch_size_b = b.data_size() / (K * N);
@@ -209,11 +229,108 @@ void mlx_matmul(
int matrix_stride_b = (batch_size_b == 1) ? 0 : K * N;
int matrix_stride_out = M * N;
/////////////////////////////////////////////////////////////////////////////
// Split K specialization
int _tm = M / 16;
int _tn = N / 16;
int _tk = K / 16;
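// Split-K pays off when the output is small (few M/N tiles) but the reduction
// dimension is long; the thresholds below gate it to roughly that regime.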
if (batch_size_out == 1 && (_tm * _tn) <= 32 && _tk >= 8) {
int bm = M < 40 ? 16 : 32;
int bn = N < 40 ? 16 : 32;
int bk = 16;
int wm = 2, wn = 2;
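// Use more partitions as K grows: 2, 4, 8, or 16 depending on how many 16-wide tiles fit along K.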
int split_k_partitions =
_tk < 16 ? 2 : (_tk < 32 ? 4 : (_tk < 64 ? 8 : 16));
int split_k_partition_stride = M * N;
int gemm_k_iterations = (K / bk) / split_k_partitions;
int split_k_partition_size = gemm_k_iterations * bk;
array C_split({split_k_partitions, M, N}, float32, nullptr, {});
C_split.set_data(allocator::malloc_or_wait(C_split.nbytes()));
copies.push_back(C_split);
std::ostringstream kname;
kname << "steel_gemm_splitk_" << (transpose_a ? 't' : 'n')
<< (transpose_b ? 't' : 'n') << "_" << type_to_name(a) << "_"
<< type_to_name(C_split) << "_bm" << bm << "_bn" << bn << "_bk" << bk
<< "_wm" << wm << "_wn" << wn << "_MN_"
<< ((M % bm == 0 && N % bn == 0) ? "t" : "n") << "aligned"
<< "_K_" << ((K % bk == 0) ? "t" : "n") << "aligned";
// Encode and dispatch gemm kernel
auto compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
compute_encoder->setComputePipelineState(kernel);
int tn = (N + bn - 1) / bn;
int tm = (M + bm - 1) / bm;
GEMMSpiltKParams params{
M,
N,
K,
lda,
ldb,
N,
tn,
tm,
split_k_partitions,
split_k_partition_stride,
split_k_partition_size,
gemm_k_iterations};
MTL::Size group_dims = MTL::Size(32, wn, wm);
MTL::Size grid_dims = MTL::Size(tn, tm, split_k_partitions);
set_array_buffer(compute_encoder, a, 0);
set_array_buffer(compute_encoder, b, 1);
set_array_buffer(compute_encoder, C_split, 2);
compute_encoder->setBytes(&params, sizeof(GEMMSpiltKParams), 3);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
// Do accum kernel
{
auto c_split_buf =
static_cast<const MTL::Resource*>(C_split.buffer().ptr());
const class MTL::Resource* const resources[1] = {c_split_buf};
compute_encoder->memoryBarrier(resources, 1);
auto kernel = d.get_kernel(
"steel_gemm_splitk_accum_" + type_to_name(out) + "_" +
type_to_name(C_split));
compute_encoder->setComputePipelineState(kernel);
// Set the arguments for the kernel
set_array_buffer(compute_encoder, C_split, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&split_k_partitions, sizeof(int), 2);
compute_encoder->setBytes(&split_k_partition_stride, sizeof(int), 3);
compute_encoder->setBytes(&N, sizeof(int), 4);
// Launch enough thread groups for each output
MTL::Size grid_dims = MTL::Size(N, M, 1);
MTL::Size group_dims = MTL::Size(std::min(1024, N * M), 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
}
d.get_command_buffer(s.index)->addCompletedHandler(
[copies](MTL::CommandBuffer*) mutable { copies.clear(); });
return;
}
/////////////////////////////////////////////////////////////////////////////
// Regular kernel dispatch
// Determine dispatch kernel
int bm = 32, bn = 32, bk = 16;
int wm = 2, wn = 2;
if ((size_t)batch_size_out * M * N >= 1ul << 20) {
if (!transpose_a && transpose_b) {
bm = 64;
bn = (out.dtype() == float32) ? 64 : 32;
@@ -224,10 +341,12 @@ void mlx_matmul(
}
}
// Prepare kernel name
std::ostringstream kname;
kname << "gemm_" << (transpose_a ? 't' : 'n') << (transpose_b ? 't' : 'n')
<< "_" << type_to_name(a) << "_" << type_to_name(out) << "_bm" << bm
<< "_bn" << bn << "_bk" << bk << "_wm" << wm << "_wn" << wn << "_MN_"
kname << "steel_gemm_" << (transpose_a ? 't' : 'n')
<< (transpose_b ? 't' : 'n') << "_" << type_to_name(a) << "_"
<< type_to_name(out) << "_bm" << bm << "_bn" << bn << "_bk" << bk
<< "_wm" << wm << "_wn" << wn << "_MN_"
<< ((M % bm == 0 && N % bn == 0) ? "t" : "n") << "aligned"
<< "_K_" << ((K % bk == 0) ? "t" : "n") << "aligned";
@@ -236,34 +355,55 @@ void mlx_matmul(
auto kernel = d.get_kernel(kname.str());
compute_encoder->setComputePipelineState(kernel);
// Use problem size to determine threadblock swizzle
int tn = (N + bn - 1) / bn;
int tm = (M + bm - 1) / bm;
// TODO: Explore device-based tuning for swizzle
int swizzle_log = 0; // tm >= 6 ? 3 : (tm <= 3 ? 0 : 2);
// Prepare steel matmul params
GEMMParams params{
M,
N,
K,
lda,
ldb,
N,
tn,
tm,
matrix_stride_a,
matrix_stride_b,
matrix_stride_out,
swizzle_log,
(K / bk)};
// Prepare launch grid params
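// The swizzle folds 2^swizzle_log M-tiles into each launched row: the grid gets
// tn * tile columns and ceil(tm / tile) rows, and the kernel unswizzles via swizzle_log.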
int tile = 1 << swizzle_log;
tm = (tm + tile - 1) / tile;
tn = tn * tile;
MTL::Size group_dims = MTL::Size(32, wn, wm);
MTL::Size grid_dims = MTL::Size(tn, tm, batch_size_out);
// Launch only 1 kernel in the case of simple batching / broadcasting
if (batch_size_out == std::max(batch_size_a, batch_size_b) &&
(batch_size_a == batch_size_b ||
std::min(batch_size_a, batch_size_b) == 1)) {
set_array_buffer(compute_encoder, a, 0);
set_array_buffer(compute_encoder, b, 1);
set_array_buffer(compute_encoder, out, 2);
compute_encoder->setBytes(&params, sizeof(GEMMParams), 3);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
} else { // Otherwise launch kernels with set offsets
MTL::Size grid_dims_single = MTL::Size(tn, tm, 1);
for (int i = 0; i < batch_size_out; ++i) {
auto a_off = elem_to_loc(M * K * i, a.shape(), a.strides());
auto b_off = elem_to_loc(K * N * i, b.shape(), b.strides());
MTL::Size group_dims = MTL::Size(32, wn, wm);
MTL::Size grid_dims = MTL::Size((N + bn - 1) / bn, (M + bm - 1) / bm, 1);
auto a_buf = static_cast<const MTL::Buffer*>(a.buffer().ptr());
auto b_buf = static_cast<const MTL::Buffer*>(b.buffer().ptr());
auto out_buf = static_cast<const MTL::Buffer*>(out.buffer().ptr());
@@ -272,13 +412,8 @@ void mlx_matmul(
compute_encoder->setBuffer(b_buf, b_off * b.itemsize(), 1);
compute_encoder->setBuffer(out_buf, i * M * N * out.itemsize(), 2);
compute_encoder->setBytes(&M, sizeof(int), 3);
compute_encoder->setBytes(&N, sizeof(int), 4);
compute_encoder->setBytes(&K, sizeof(int), 5);
compute_encoder->setBytes(&matrix_stride_a, sizeof(int), 6);
compute_encoder->setBytes(&matrix_stride_b, sizeof(int), 7);
compute_encoder->setBytes(&matrix_stride_out, sizeof(int), 8);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
compute_encoder->setBytes(&params, sizeof(GEMMParams), 3);
compute_encoder->dispatchThreadgroups(grid_dims_single, group_dims);
}
}
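// elem_to_loc(e, shape, strides), used above for the per-batch offsets, is
// assumed to map a linear element index to a memory offset roughly as:
//   size_t loc = 0;
//   for (int i = shape.size() - 1; i >= 0; --i) {
//     loc += (e % shape[i]) * strides[i];
//     e /= shape[i];
//   }
// which is what lets the fallback loop honor broadcasted or otherwise
// non-contiguous batch strides.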
@@ -300,6 +435,9 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
auto& a_pre = inputs[0];
auto& b_pre = inputs[1];
/////////////////////////////////////////////////////////////////////////////
// Init checks and prep
// Keep a vector with copies to be cleared in the completed buffer to release
// the arrays
std::vector<array> copies;
@@ -328,6 +466,9 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
auto batch_size_out = out.size() / (M * N);
/////////////////////////////////////////////////////////////////////////////
// Gemv specialization
// Route to gemv if needed
if (std::min(M, N) == 1) {
// Collect problem info
@@ -433,10 +574,13 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
return;
}
d.end_encoding(s.index);
/////////////////////////////////////////////////////////////////////////////
// Gemm specialization
if (use_mps()) {
mps_matmul(
d.end_encoding(s.index);
return mps_matmul(
s,
d,
a,
@@ -451,10 +595,9 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
a_transposed,
b_transposed,
copies);
return;
}
mlx_matmul(
return steel_matmul(
s,
d,
a,
@@ -471,4 +614,266 @@ void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
copies);
}
void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 3);
if (!is_floating_point(out.dtype())) {
throw std::runtime_error(
"[matmul] Does not yet support non-floating point types.");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto& s = stream();
auto& d = metal::device(s.device);
auto& a_pre = inputs[0];
auto& b_pre = inputs[1];
auto& c_pre = inputs[2];
/////////////////////////////////////////////////////////////////////////////
// Init checks and prep
// Keep a vector with copies to be cleared in the completed buffer to release
// the arrays
std::vector<array> copies;
auto check_transpose = [&copies, &s](const array& arr) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (stx == arr.shape(-1) && sty == 1) {
return std::make_tuple(false, stx, arr);
} else if (stx == 1 && sty == arr.shape(-2)) {
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy_gpu(arr, arr_copy, CopyType::General, s);
copies.push_back(arr_copy);
size_t stx = arr.shape(-1);
return std::make_tuple(false, stx, arr_copy);
}
};
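// The lambda above distinguishes three layouts: row-contiguous input (used
// as-is, leading dimension = stride of the second-to-last axis),
// column-contiguous input (flagged as transposed, leading dimension = stride
// of the last axis), and everything else (materialized into a row-contiguous
// copy first). The returned leading dimension becomes lda / ldb for the GEMM
// kernel.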
auto [transpose_a, a_cols, a] = check_transpose(a_pre);
auto [transpose_b, b_cols, b] = check_transpose(b_pre);
int M = a.shape(-2);
int N = b.shape(-1);
int K = a.shape(-1);
auto batch_size_out = out.size() / (M * N);
array c = c_pre;
int ldc = c.strides()[c.ndim() - 2];
int fdc = c.strides()[c.ndim() - 1];
int matrix_stride_c = c.ndim() <= 2 ? 0 : c.strides()[c.ndim() - 3];
int lda = a_cols;
int ldb = b_cols;
using namespace mlx::steel;
// Account for batch sizes and basic broadcasting
int batch_size_a = a.data_size() / (M * K);
int batch_size_b = b.data_size() / (K * N);
int matrix_stride_a = (batch_size_a == 1) ? 0 : M * K;
int matrix_stride_b = (batch_size_b == 1) ? 0 : K * N;
int matrix_stride_out = M * N;
int _tm = M / 16;
int _tn = N / 16;
int _tk = K / 16;
/////////////////////////////////////////////////////////////////////////////
// Split K specialization
if (batch_size_out == 1 && (_tm * _tn) <= 32 && _tk >= 8) {
int bm = M < 40 ? 16 : 32;
int bn = N < 40 ? 16 : 32;
int bk = 16;
int wm = 2, wn = 2;
int split_k_partitions =
_tk < 16 ? 2 : (_tk < 32 ? 4 : (_tk < 64 ? 8 : 16));
int split_k_partition_stride = M * N;
int gemm_k_iterations = (K / bk) / split_k_partitions;
int split_k_partition_size = gemm_k_iterations * bk;
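// Worked example (illustrative): for K = 4096 and bk = 16, _tk = 256, so
// split_k_partitions = 16, gemm_k_iterations = (4096 / 16) / 16 = 16 and
// split_k_partition_size = 16 * 16 = 256, i.e. each of the 16 partitions
// accumulates a 256-wide slice of K into its own M x N slab of C_split.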
array C_split({split_k_partitions, M, N}, float32, nullptr, {});
C_split.set_data(allocator::malloc_or_wait(C_split.nbytes()));
copies.push_back(C_split);
std::ostringstream kname;
kname << "steel_gemm_splitk_" << (transpose_a ? 't' : 'n')
<< (transpose_b ? 't' : 'n') << "_" << type_to_name(a) << "_"
<< type_to_name(C_split) << "_bm" << bm << "_bn" << bn << "_bk" << bk
<< "_wm" << wm << "_wn" << wn << "_MN_"
<< ((M % bm == 0 && N % bn == 0) ? "t" : "n") << "aligned"
<< "_K_" << ((K % bk == 0) ? "t" : "n") << "aligned";
// Encode and dispatch gemm kernel
auto compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
compute_encoder->setComputePipelineState(kernel);
int tn = (N + bn - 1) / bn;
int tm = (M + bm - 1) / bm;
GEMMSpiltKParams params{
M,
N,
K,
lda,
ldb,
N,
tn,
tm,
split_k_partitions,
split_k_partition_stride,
split_k_partition_size,
gemm_k_iterations};
MTL::Size group_dims = MTL::Size(32, wn, wm);
MTL::Size grid_dims = MTL::Size(tn, tm, split_k_partitions);
set_array_buffer(compute_encoder, a, 0);
set_array_buffer(compute_encoder, b, 1);
set_array_buffer(compute_encoder, C_split, 2);
compute_encoder->setBytes(&params, sizeof(GEMMSpiltKParams), 3);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
// Do accum kernel
{
auto kernel = d.get_kernel(
"steel_gemm_splitk_accum_" + type_to_name(out) + "_" +
type_to_name(C_split) + "_axpby");
compute_encoder->setComputePipelineState(kernel);
// Set the arguments for the kernel
set_array_buffer(compute_encoder, C_split, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&split_k_partitions, sizeof(int), 2);
compute_encoder->setBytes(&split_k_partition_stride, sizeof(int), 3);
compute_encoder->setBytes(&N, sizeof(int), 4);
set_array_buffer(compute_encoder, c, 5);
compute_encoder->setBytes(&ldc, sizeof(int), 6);
compute_encoder->setBytes(&fdc, sizeof(int), 7);
compute_encoder->setBytes(&alpha_, sizeof(float), 8);
compute_encoder->setBytes(&beta_, sizeof(float), 9);
// Launch enough thread groups for each output
MTL::Size grid_dims = MTL::Size(N, M, 1);
MTL::Size group_dims = MTL::Size(std::min(1024, N * M), 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
}
d.get_command_buffer(s.index)->addCompletedHandler(
[copies](MTL::CommandBuffer*) mutable { copies.clear(); });
return;
}
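// Reference semantics of the accumulation dispatched above, as a host-side
// sketch (pointer names are illustrative; assumes row-major C_split of shape
// [split_k_partitions, M, N]):
//   for (int m = 0; m < M; ++m) {
//     for (int n = 0; n < N; ++n) {
//       float acc = 0;
//       for (int p = 0; p < split_k_partitions; ++p) {
//         acc += C_split_ptr[p * split_k_partition_stride + m * N + n];
//       }
//       out_ptr[m * N + n] = alpha_ * acc + beta_ * c_ptr[m * ldc + n * fdc];
//     }
//   }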
/////////////////////////////////////////////////////////////////////////////
// Regular addmm dispatch
// Determine dispatch kernel
int bm = 32, bn = 32, bk = 16;
int wm = 2, wn = 2;
if ((size_t)batch_size_out * M * N >= 1ul << 20) {
if (!transpose_a && transpose_b) {
bm = 64;
bn = (out.dtype() == float32) ? 64 : 32;
bk = (out.dtype() == float32) ? 16 : 32;
} else {
bm = 64;
bn = 64;
}
}
std::ostringstream kname;
kname << "steel_addmm_" << (transpose_a ? 't' : 'n')
<< (transpose_b ? 't' : 'n') << "_" << type_to_name(a) << "_"
<< type_to_name(out) << "_bm" << bm << "_bn" << bn << "_bk" << bk
<< "_wm" << wm << "_wn" << wn << "_MN_"
<< ((M % bm == 0 && N % bn == 0) ? "t" : "n") << "aligned"
<< "_K_" << ((K % bk == 0) ? "t" : "n") << "aligned"
<< ((alpha_ == 1. && beta_ == 1.) ? "_add" : "_axpby");
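// For example, assuming type_to_name(float32) yields "float32", a
// non-transposed float32 problem with the default bm = bn = 32, bk = 16 tiles,
// M and N divisible by 32, K divisible by 16, and alpha == beta == 1 would
// select a kernel named something like
// "steel_addmm_nn_float32_float32_bm32_bn32_bk16_wm2_wn2_MN_taligned_K_taligned_add";
// any other alpha/beta picks the "_axpby" variant.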
// Encode and dispatch kernel
auto compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
compute_encoder->setComputePipelineState(kernel);
int tn = (N + bn - 1) / bn;
int tm = (M + bm - 1) / bm;
// TODO: Explore device-based tuning for swizzle
int swizzle_log = 0; // tm >= 6 ? 3 : (tm <= 3 ? 0 : 2);
GEMMAddMMParams params{
M,
N,
K,
lda,
ldb,
ldc,
N,
tn,
tm,
matrix_stride_a,
matrix_stride_b,
matrix_stride_c,
matrix_stride_out,
swizzle_log,
(K / bk),
alpha_,
beta_,
fdc};
int tile = 1 << swizzle_log;
tm = (tm + tile - 1) / tile;
tn = tn * tile;
MTL::Size group_dims = MTL::Size(32, wn, wm);
MTL::Size grid_dims = MTL::Size(tn, tm, batch_size_out);
// Launch only 1 kernel in the case of simple batching / broadcasting
if (batch_size_out == std::max(batch_size_a, batch_size_b) &&
(batch_size_a == batch_size_b ||
std::min(batch_size_a, batch_size_b) == 1)) {
set_array_buffer(compute_encoder, a, 0);
set_array_buffer(compute_encoder, b, 1);
set_array_buffer(compute_encoder, c, 2);
set_array_buffer(compute_encoder, out, 3);
compute_encoder->setBytes(&params, sizeof(GEMMAddMMParams), 4);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
} else { // Otherwise launch kernels with set offsets
MTL::Size grid_dims_single = MTL::Size(tn, tm, 1);
for (int i = 0; i < batch_size_out; ++i) {
auto a_off = elem_to_loc(M * K * i, a.shape(), a.strides());
auto b_off = elem_to_loc(K * N * i, b.shape(), b.strides());
auto c_off = elem_to_loc(M * N * i, c.shape(), c.strides());
auto a_buf = static_cast<const MTL::Buffer*>(a.buffer().ptr());
auto b_buf = static_cast<const MTL::Buffer*>(b.buffer().ptr());
auto c_buf = static_cast<const MTL::Buffer*>(c.buffer().ptr());
auto out_buf = static_cast<const MTL::Buffer*>(out.buffer().ptr());
compute_encoder->setBuffer(a_buf, a_off * a.itemsize(), 0);
compute_encoder->setBuffer(b_buf, b_off * b.itemsize(), 1);
compute_encoder->setBuffer(c_buf, c_off * c.itemsize(), 2);
compute_encoder->setBuffer(out_buf, i * M * N * out.itemsize(), 3);
compute_encoder->setBytes(&params, sizeof(GEMMAddMMParams), 4);
compute_encoder->dispatchThreadgroups(grid_dims_single, group_dims);
}
}
d.get_command_buffer(s.index)->addCompletedHandler(
[copies](MTL::CommandBuffer*) mutable { copies.clear(); });
return;
}
} // namespace mlx::core


@@ -12,7 +12,7 @@
namespace mlx::core {
void mlx_matmul(
void steel_matmul(
const Stream& s,
metal::Device& d,
const array& a,


@@ -4,7 +4,6 @@
#include <future>
#include <memory>
#include "mlx/array.h"
#include "mlx/backend/metal/device.h"
#include "mlx/primitives.h"
#include "mlx/scheduler.h"
@@ -43,42 +42,57 @@ MTL::CommandBuffer* increment_command_buffer(Stream s) {
return command_buffer;
}
inline void check_error(MTL::CommandBuffer* cbuf) {
if (cbuf->status() == MTL::CommandBufferStatusError) {
std::ostringstream msg;
msg << "[METAL] Command buffer execution failed: "
<< cbuf->error()->localizedDescription()->utf8String();
throw std::runtime_error(msg.str());
}
}
std::function<void()> make_task(
array& arr,
std::vector<std::shared_future<void>> deps,
std::shared_ptr<std::promise<void>> p,
bool retain_graph) {
auto task =
[retain_graph, arr, deps = std::move(deps), p = std::move(p)]() mutable {
auto pool = new_scoped_memory_pool();
for (auto& d : deps) {
d.wait();
}
auto s = arr.primitive().stream();
auto command_buffer = increment_command_buffer(s);
arr.primitive().eval_gpu(arr.inputs(), arr);
if (p) {
metal::device(s.device).end_encoding(s.index);
scheduler::notify_new_task(s);
command_buffer->addCompletedHandler(
[retain_graph, s, arr, p = std::move(p)](
MTL::CommandBuffer*) mutable {
if (!retain_graph) {
arr.detach();
}
p->set_value();
scheduler::notify_task_completion(s);
});
metal::device(s.device).commit_command_buffer(s.index);
} else {
command_buffer->addCompletedHandler(
[retain_graph, s, arr](MTL::CommandBuffer*) mutable {
if (!retain_graph) {
arr.detach();
}
});
}
};
std::shared_ptr<std::promise<void>> p) {
auto task = [arr, deps = std::move(deps), p = std::move(p)]() mutable {
auto pool = new_scoped_memory_pool();
for (auto& d : deps) {
d.wait();
}
auto s = arr.primitive().stream();
auto command_buffer = increment_command_buffer(s);
auto outputs = arr.outputs();
arr.primitive().eval_gpu(arr.inputs(), outputs);
std::vector<std::shared_ptr<array::Data>> buffers;
for (auto& in : arr.inputs()) {
buffers.push_back(in.data_shared_ptr());
}
for (auto& s : arr.siblings()) {
buffers.push_back(s.data_shared_ptr());
}
if (!arr.is_tracer()) {
arr.detach();
}
if (p) {
metal::device(s.device).end_encoding(s.index);
scheduler::notify_new_task(s);
command_buffer->addCompletedHandler(
[s, buffers = std::move(buffers), p = std::move(p)](
MTL::CommandBuffer* cbuf) {
p->set_value();
scheduler::notify_task_completion(s);
check_error(cbuf);
});
metal::device(s.device).commit_command_buffer(s.index);
} else {
command_buffer->addCompletedHandler(
[s, buffers = std::move(buffers)](MTL::CommandBuffer* cbuf) {
check_error(cbuf);
});
}
};
return task;
}
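// Note on the captures above: the Data shared_ptrs of the inputs and siblings
// are moved into the completion handler so their Metal buffers stay alive
// until the command buffer has actually finished executing, even though the
// arrays themselves may already have been detached (and their buffers donated)
// right after encoding.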


@@ -19,13 +19,15 @@ constexpr bool is_available() {
#endif
}
bool cache_enabled(void);
void set_cache_enabled(bool enabled);
void new_stream(Stream stream);
std::shared_ptr<void> new_scoped_memory_pool();
std::function<void()> make_task(
array& arr,
std::vector<std::shared_future<void>> deps,
std::shared_ptr<std::promise<void>> p,
bool retain_graph);
std::shared_ptr<std::promise<void>> p);
} // namespace mlx::core::metal


@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cassert>
@@ -21,13 +21,19 @@ static constexpr int METAL_MAX_INDEX_ARRAYS = 10;
void binary_op(
const std::vector<array>& inputs,
array& out,
std::vector<array>& outputs,
const std::string op) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out, bopt);
set_binary_op_output_data(a, b, outputs[0], bopt, true);
set_binary_op_output_data(a, b, outputs[1], bopt, true);
auto& out = outputs[0];
if (out.size() == 0) {
return;
}
// Try to collapse contiguous dims
auto [shape, strides] = collapse_contiguous_dims(a, b, out);
@@ -54,7 +60,7 @@ void binary_op(
break;
}
kname << op << type_to_name(a);
if (bopt == General && out.ndim() <= MAX_BINARY_SPECIALIZED_DIMS) {
if (bopt == General && shape.size() <= MAX_BINARY_SPECIALIZED_DIMS) {
kname << "_" << shape.size();
}
@@ -63,8 +69,108 @@ void binary_op(
auto kernel = d.get_kernel(kname.str());
auto compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
set_array_buffer(compute_encoder, a, 0);
set_array_buffer(compute_encoder, b, 1);
// - If a is donated it goes to the first output
// - If b is donated it goes to the first output if a was not donated
// otherwise it goes to the second output
bool donate_a = a.data_shared_ptr() == nullptr;
bool donate_b = b.data_shared_ptr() == nullptr;
set_array_buffer(compute_encoder, donate_a ? outputs[0] : a, 0);
set_array_buffer(
compute_encoder, donate_b ? (donate_a ? outputs[1] : outputs[0]) : b, 1);
set_array_buffer(compute_encoder, outputs[0], 2);
set_array_buffer(compute_encoder, outputs[1], 3);
if (bopt == General) {
auto ndim = shape.size();
if (ndim > 3) {
compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 4);
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 5);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 6);
} else {
// The shape is implicit in the grid for <= 3D
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 4);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 5);
}
if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
compute_encoder->setBytes(&ndim, sizeof(int), 7);
}
// Launch up to 3D grid of threads
size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
size_t rest = out.size() / (dim0 * dim1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size != 1024) {
throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
}
auto group_dims = get_block_dims(dim0, dim1, rest);
MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
compute_encoder->dispatchThreads(grid_dims, group_dims);
} else {
// Launch a 1D grid of threads
size_t nthreads = out.data_size();
MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size > nthreads) {
thread_group_size = nthreads;
}
MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
}
}
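// Donation rule above, spelled out: a null data_shared_ptr on an input means
// set_binary_op_output_data already moved that input's buffer into an output,
// so the corresponding output array is bound in the input's argument slot. If
// a was donated it backs outputs[0]; if b was also donated it backs
// outputs[1]; if only b was donated it backs outputs[0]. For DivMod on two
// contiguous, otherwise unreferenced inputs, both quotient and remainder can
// therefore be written in place.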
void binary_op(
const std::vector<array>& inputs,
array& out,
const std::string op) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out, bopt, true);
if (out.size() == 0) {
return;
}
// Try to collapse contiguous dims
auto [shape, strides] = collapse_contiguous_dims(a, b, out);
auto& strides_a = strides[0];
auto& strides_b = strides[1];
auto& strides_out = strides[2];
std::ostringstream kname;
switch (bopt) {
case ScalarScalar:
kname << "ss";
break;
case ScalarVector:
kname << "sv";
break;
case VectorScalar:
kname << "vs";
break;
case VectorVector:
kname << "vv";
break;
case General:
kname << "g";
break;
}
kname << op << type_to_name(a);
if (bopt == General && shape.size() <= MAX_BINARY_SPECIALIZED_DIMS) {
kname << "_" << shape.size();
}
auto& s = out.primitive().stream();
auto& d = metal::device(s.device);
auto kernel = d.get_kernel(kname.str());
auto compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
bool donate_a = a.data_shared_ptr() == nullptr;
bool donate_b = b.data_shared_ptr() == nullptr;
set_array_buffer(compute_encoder, donate_a ? out : a, 0);
set_array_buffer(compute_encoder, donate_b ? out : b, 1);
set_array_buffer(compute_encoder, out, 2);
if (bopt == General) {
@@ -114,14 +220,21 @@ void unary_op(
auto& in = inputs[0];
bool contig = in.flags().contiguous;
if (contig) {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
if (in.is_donatable() && in.itemsize() == out.itemsize()) {
out.move_shared_buffer(in);
} else {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
}
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
if (in.size() == 0) {
return;
}
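// is_donatable() is assumed to mean the input holds the only live reference to
// its buffer; when the item sizes match, the unary result can then be written
// in place over the input via move_shared_buffer instead of allocating a new
// output buffer.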
auto& s = out.primitive().stream();
auto& d = metal::device(s.device);
@@ -139,7 +252,8 @@ void unary_op(
auto compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(
compute_encoder, in.data_shared_ptr() == nullptr ? out : in, 0);
set_array_buffer(compute_encoder, out, 1);
if (!contig) {
compute_encoder->setBytes(in.shape().data(), in.ndim() * sizeof(int), 2);
@@ -171,6 +285,9 @@ void arange_set_scalars(T start, T next, MTL::ComputeCommandEncoder* enc) {
void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 0);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
if (out.size() == 0) {
return;
}
auto& s = stream();
auto& d = metal::device(s.device);
auto kernel = d.get_kernel("arange" + type_to_name(out));
@@ -298,9 +415,18 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder->setComputePipelineState(kernel);
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 2);
compute_encoder->setBytes(in_strides.data(), ndim * sizeof(size_t), 3);
compute_encoder->setBytes(out_strides.data(), ndim * sizeof(size_t), 4);
if (ndim == 0) {
// Pass placeholders so Metal doesn't complain
int shape_ = 0;
size_t stride_ = 0;
compute_encoder->setBytes(&shape_, sizeof(int), 2);
compute_encoder->setBytes(&stride_, sizeof(size_t), 3);
compute_encoder->setBytes(&stride_, sizeof(size_t), 4);
} else {
compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 2);
compute_encoder->setBytes(in_strides.data(), ndim * sizeof(size_t), 3);
compute_encoder->setBytes(out_strides.data(), ndim * sizeof(size_t), 4);
}
compute_encoder->setBytes(&ndim, sizeof(size_t), 5);
compute_encoder->setBytes(&axis_stride, sizeof(size_t), 6);
compute_encoder->setBytes(&axis_size, sizeof(size_t), 7);
@@ -360,10 +486,28 @@ void Cosh::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "cosh");
}
void CustomVJP::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
eval(inputs, outputs);
}
void Depends::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
eval(inputs, outputs);
}
void Divide::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "div");
}
void DivMod::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
binary_op(inputs, outputs, "divmod");
}
void Remainder::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "rem");
}
@@ -439,6 +583,20 @@ void LogicalNot::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "lnot");
}
void LogicalAnd::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(
inputs,
out,
"land"); // Assume "land" is the operation identifier for logical AND
}
void LogicalOr::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(
inputs,
out,
"lor"); // Assume "lor" is the operation identifier for logical OR
}
void LogAddExp::eval_gpu(const std::vector<array>& inputs, array& out) {
binary_op(inputs, out, "lae");
}
@@ -517,6 +675,9 @@ void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
size_t elems_per_key = out.size() / num_keys;
size_t bytes_per_key = out.itemsize() * elems_per_key;
out.set_data(allocator::malloc_or_wait(out.nbytes()));
if (out.size() == 0) {
return;
}
size_t out_per_key = (bytes_per_key + 4 - 1) / 4;
size_t half_size = out_per_key / 2;
@@ -591,6 +752,12 @@ void Sinh::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "sinh");
}
void Split::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
eval(inputs, outputs);
}
void Square::eval_gpu(const std::vector<array>& inputs, array& out) {
unary_op(inputs, out, "square");
}
@@ -627,4 +794,10 @@ void Transpose::eval_gpu(const std::vector<array>& inputs, array& out) {
eval(inputs, out);
}
void QRF::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
throw std::runtime_error("[QRF::eval_gpu] Metal QR factorization NYI.");
}
} // namespace mlx::core


@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <cassert>
@@ -52,7 +52,7 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
auto kernel = d.get_kernel(kname.str());
compute_encoder->setComputePipelineState(kernel);
int bo = 32;
int bo = std::min(32, O);
int bd = 32;
MTL::Size group_dims = MTL::Size(bd, bo, 1);
MTL::Size grid_dims = MTL::Size(1, O / bo, B);
@@ -72,7 +72,7 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
else {
std::ostringstream kname;
kname << "qmm_t_" << type_to_name(out) << "_gs_" << group_size_ << "_b_"
<< bits_;
<< bits_ << "_alN_" << std::boolalpha << ((O % 32) == 0);
// Encode and dispatch kernel
auto compute_encoder = d.get_command_encoder(s.index);
@@ -85,7 +85,7 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
int bn = 32;
int bk = 64;
MTL::Size group_dims = MTL::Size(32, wn, wm);
MTL::Size grid_dims = MTL::Size(O / bn, (B + bm - 1) / bm, 1);
MTL::Size grid_dims = MTL::Size((O + bn - 1) / bn, (B + bm - 1) / bm, 1);
set_array_buffer(compute_encoder, x, 0);
set_array_buffer(compute_encoder, w, 1);
@@ -110,10 +110,10 @@ void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
auto kernel = d.get_kernel(kname.str());
compute_encoder->setComputePipelineState(kernel);
int bo = 32;
int bo = std::min(32, O);
int bd = 32;
MTL::Size group_dims = MTL::Size(bd, bo, 1);
MTL::Size grid_dims = MTL::Size(1, (w.shape(1) + bo - 1) / bo, B);
MTL::Size grid_dims = MTL::Size(1, (O + bo - 1) / bo, B);
set_array_buffer(compute_encoder, x, 0);
set_array_buffer(compute_encoder, w, 1);


@@ -2,7 +2,6 @@
#include <algorithm>
#include <cassert>
#include <iostream>
#include <sstream>
#include "mlx/backend/common/reduce.h"
@@ -21,46 +20,103 @@ namespace mlx::core {
namespace {
inline auto safe_div(size_t n, size_t m) {
return m == 0 ? 0 : (n + m - 1) / m;
}
inline auto safe_divup(size_t n, size_t m) {
return safe_div(n, m) * m;
}
inline bool is_64b_int(Dtype dtype) {
return dtype == int64 || dtype == uint64;
}
// All Reduce
void all_reduce_dispatch(
const array& in,
array& out,
const std::string& op_name,
MTL::ComputeCommandEncoder* compute_encoder,
metal::Device& d) {
// Get kernel and encode buffers
size_t in_size = in.size();
auto kernel = d.get_kernel("all_reduce_" + op_name + type_to_name(in));
metal::Device& d,
const Stream& s) {
Dtype out_dtype = out.dtype();
bool is_out_64b_int = is_64b_int(out_dtype);
auto kernel = (is_out_64b_int)
? d.get_kernel("all_reduce_no_atomics_" + op_name + type_to_name(in))
: d.get_kernel("all_reduce_" + op_name + type_to_name(in));
compute_encoder->setComputePipelineState(kernel);
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&in_size, sizeof(size_t), 2);
// Set grid dimensions
// We make sure each thread has enough to do by making it read in
// at least n_reads inputs
int n_reads = REDUCE_N_READS;
size_t in_size = in.size();
// mod_in_size gives us the groups of n_reads needed to go over the entire
// input
uint mod_in_size = (in_size + n_reads - 1) / n_reads;
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
thread_group_size =
mod_in_size > thread_group_size ? thread_group_size : mod_in_size;
uint simd_size = kernel->threadExecutionWidth();
thread_group_size =
((thread_group_size + simd_size - 1) / simd_size) * simd_size;
// If the number of thread groups needed exceeds 1024, we reuse thread groups
uint n_thread_groups =
(mod_in_size + thread_group_size - 1) / thread_group_size;
uint n_thread_groups = safe_div(mod_in_size, thread_group_size);
n_thread_groups = std::min(n_thread_groups, 1024u);
uint nthreads = n_thread_groups * thread_group_size;
MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
// Encode buffers and dispatch
if (is_out_64b_int == false || n_thread_groups == 1) {
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&in_size, sizeof(size_t), 2);
compute_encoder->dispatchThreads(grid_dims, group_dims);
} else {
// Allocate intermediate array to store partial reduction results
size_t intermediate_size = n_thread_groups;
array intermediate =
array({static_cast<int>(intermediate_size)}, out_dtype, nullptr, {});
intermediate.set_data(allocator::malloc_or_wait(intermediate.nbytes()));
std::vector<array> intermediates = {intermediate};
// First dispatch
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, intermediate, 1);
compute_encoder->setBytes(&in_size, sizeof(size_t), 2);
compute_encoder->dispatchThreads(grid_dims, group_dims);
// Second pass to reduce intermediate reduction results written to DRAM
set_array_buffer(compute_encoder, intermediate, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&intermediate_size, sizeof(size_t), 2);
mod_in_size = (intermediate_size + n_reads - 1) / n_reads;
thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
thread_group_size =
mod_in_size > thread_group_size ? thread_group_size : mod_in_size;
thread_group_size =
((thread_group_size + simd_size - 1) / simd_size) * simd_size;
// If the number of thread groups needed exceeds 1024, we reuse thread
// groups
nthreads = thread_group_size;
group_dims = MTL::Size(thread_group_size, 1, 1);
grid_dims = MTL::Size(nthreads, 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
d.get_command_buffer(s.index)->addCompletedHandler(
[intermediates](MTL::CommandBuffer*) mutable {
intermediates.clear();
});
}
}
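// Shape of the two-pass path above (sum used as an example): pass one has each
// of the n_thread_groups threadgroups reduce its slice of `in` into one entry
// of `intermediate`; pass two re-runs the same kernel with a single threadgroup
// over those partials to produce the final value in `out`. The detour exists
// because the 64-bit integer kernels cannot rely on the atomic accumulation
// used by the single-pass variants.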
void row_reduce_general_dispatch(
@@ -70,22 +126,31 @@ void row_reduce_general_dispatch(
const ReductionPlan& plan,
const std::vector<int>& axes,
MTL::ComputeCommandEncoder* compute_encoder,
metal::Device& d) {
auto kernel =
d.get_kernel("row_reduce_general_" + op_name + type_to_name(in));
metal::Device& d,
const Stream& s) {
Dtype out_dtype = out.dtype();
bool is_out_64b_int = is_64b_int(out_dtype);
auto kernel = (is_out_64b_int)
? d.get_kernel(
"row_reduce_general_no_atomics_" + op_name + type_to_name(in))
: d.get_kernel("row_reduce_general_" + op_name + type_to_name(in));
compute_encoder->setComputePipelineState(kernel);
// Prepare the arguments for the kernel
int n_reads = REDUCE_N_READS;
size_t reduction_size = plan.shape.back();
size_t out_size = out.size();
auto shape = plan.shape;
auto strides = plan.strides;
shape.pop_back();
strides.pop_back();
size_t non_row_reductions = 1;
for (auto s : shape) {
non_row_reductions *= static_cast<size_t>(s);
}
size_t out_size = out.size();
auto [rem_shape, rem_strides] = shapes_without_reduction_axes(in, axes);
for (auto s : rem_shape) {
shape.push_back(s);
@@ -95,16 +160,6 @@ void row_reduce_general_dispatch(
}
int ndim = shape.size();
// Set the arguments for the kernel
compute_encoder->setComputePipelineState(kernel);
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&reduction_size, sizeof(size_t), 2);
compute_encoder->setBytes(&out_size, sizeof(size_t), 3);
compute_encoder->setBytes(shape.data(), shape.size() * sizeof(int), 4);
compute_encoder->setBytes(strides.data(), strides.size() * sizeof(size_t), 5);
compute_encoder->setBytes(&ndim, sizeof(int), 6);
// Each thread group is responsible for 1 output
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
thread_group_size =
@@ -121,7 +176,88 @@ void row_reduce_general_dispatch(
MTL::Size grid_dims = MTL::Size(n_threads, non_row_reductions, 1);
MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
if (is_out_64b_int == false || non_row_reductions == 1) {
// Set the arguments for the kernel
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&reduction_size, sizeof(size_t), 2);
compute_encoder->setBytes(&out_size, sizeof(size_t), 3);
compute_encoder->setBytes(shape.data(), shape.size() * sizeof(int), 4);
compute_encoder->setBytes(
strides.data(), strides.size() * sizeof(size_t), 5);
compute_encoder->setBytes(&ndim, sizeof(int), 6);
compute_encoder->dispatchThreads(grid_dims, group_dims);
} else {
// Allocate intermediate array to store partial reduction results
array intermediate = array(
{static_cast<int>(out.size()), static_cast<int>(non_row_reductions)},
out_dtype,
nullptr,
{});
intermediate.set_data(allocator::malloc_or_wait(intermediate.nbytes()));
std::vector<array> intermediates = {intermediate};
// Set the arguments for the kernel
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, intermediate, 1);
compute_encoder->setBytes(&reduction_size, sizeof(size_t), 2);
compute_encoder->setBytes(&out_size, sizeof(size_t), 3);
compute_encoder->setBytes(shape.data(), shape.size() * sizeof(int), 4);
compute_encoder->setBytes(
strides.data(), strides.size() * sizeof(size_t), 5);
compute_encoder->setBytes(&ndim, sizeof(int), 6);
compute_encoder->dispatchThreads(grid_dims, group_dims);
// Set up second dispatch
reduction_size = non_row_reductions;
out_size = 1;
// Shape of axes that aren't participating in reduction remains unchanged.
std::vector<int> new_shape = rem_shape;
// Update their strides since they'll be different after the partial reduction
// in the first compute dispatch.
std::vector<size_t> new_strides = rem_strides;
new_strides.back() = reduction_size;
for (int i = new_shape.size() - 2; i >= 0; i--) {
new_strides[i] = new_shape[i + 1] * new_strides[i + 1];
}
ndim = new_shape.size();
// Set the arguments for the kernel
set_array_buffer(compute_encoder, intermediate, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&reduction_size, sizeof(size_t), 2);
compute_encoder->setBytes(&out_size, sizeof(size_t), 3);
compute_encoder->setBytes(
new_shape.data(), new_shape.size() * sizeof(int), 4);
compute_encoder->setBytes(
new_strides.data(), new_strides.size() * sizeof(size_t), 5);
compute_encoder->setBytes(&ndim, sizeof(int), 6);
// Each thread group is responsible for 1 output
thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
thread_group_size =
std::min((reduction_size + n_reads - 1) / n_reads, thread_group_size);
// Align thread group size with simd_size
thread_group_size =
(thread_group_size + simd_size - 1) / simd_size * simd_size;
assert(thread_group_size <= kernel->maxTotalThreadsPerThreadgroup());
// Launch enough thread groups for each output
n_threads = thread_group_size;
grid_dims = MTL::Size(n_threads, out.size(), 1);
group_dims = MTL::Size(thread_group_size, 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
d.get_command_buffer(s.index)->addCompletedHandler(
[intermediates](MTL::CommandBuffer*) mutable {
intermediates.clear();
});
}
}
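// Worked example for the second-pass strides above (illustrative): with
// rem_shape = {4, 5} and non_row_reductions = 8 partials per output, the
// intermediate is viewed as shape {4, 5, 8}, so new_strides becomes
// {5 * 8, 8} = {40, 8} and the second pass row-reduces the trailing 8 partials
// for each of the 20 outputs.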
void strided_reduce_general_dispatch(
@@ -131,9 +267,16 @@ void strided_reduce_general_dispatch(
const ReductionPlan& plan,
const std::vector<int>& axes,
MTL::ComputeCommandEncoder* compute_encoder,
metal::Device& d) {
auto kernel =
d.get_kernel("col_reduce_general_" + op_name + type_to_name(in));
metal::Device& d,
const Stream& s) {
Dtype out_dtype = out.dtype();
bool is_out_64b_int = is_64b_int(out_dtype);
auto kernel = (is_out_64b_int)
? d.get_kernel(
"col_reduce_general_no_atomics_" + op_name + type_to_name(in))
: d.get_kernel("col_reduce_general_" + op_name + type_to_name(in));
compute_encoder->setComputePipelineState(kernel);
// Prepare the arguments for the kernel
size_t reduction_size = plan.shape.back();
@@ -156,19 +299,7 @@ void strided_reduce_general_dispatch(
}
int ndim = shape.size();
// Set the arguments for the kernel
compute_encoder->setComputePipelineState(kernel);
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&reduction_size, sizeof(size_t), 2);
compute_encoder->setBytes(&reduction_stride, sizeof(size_t), 3);
compute_encoder->setBytes(&out_size, sizeof(size_t), 4);
compute_encoder->setBytes(shape.data(), shape.size() * sizeof(int), 5);
compute_encoder->setBytes(strides.data(), strides.size() * sizeof(size_t), 6);
compute_encoder->setBytes(&ndim, sizeof(int), 7);
// Select block dimensions
// Each thread reads 16 inputs to give it more work
uint n_inputs_per_thread = REDUCE_N_READS;
uint n_threads_per_output =
@@ -177,14 +308,22 @@ void strided_reduce_general_dispatch(
// We spread outputs over the x dimension and inputs over the y dimension
// Threads with the same lid.x in a given threadgroup work on the same
// output and each thread in the y dimension accumulates for that output
// Threads with the same lid.x, i.e. each column of threads, work on the same output
uint threadgroup_dim_x = std::min(out_size, 128ul);
// The number of threads along y depends on the number of reductions needed.
uint threadgroup_dim_y =
kernel->maxTotalThreadsPerThreadgroup() / threadgroup_dim_x;
threadgroup_dim_y = std::min(n_threads_per_output, threadgroup_dim_y);
// Derive number of thread groups along x, based on how many threads we need
// along x
uint n_threadgroups_x =
(out_size + threadgroup_dim_x - 1) / threadgroup_dim_x;
// Derive number of thread groups along y based on how many threads we need
// along y
uint n_threadgroups_y =
(n_threads_per_output + threadgroup_dim_y - 1) / threadgroup_dim_y;
@@ -193,17 +332,122 @@ void strided_reduce_general_dispatch(
MTL::Size(n_threadgroups_x, n_threadgroups_y, non_col_reductions);
MTL::Size group_dims = MTL::Size(threadgroup_dim_x, threadgroup_dim_y, 1);
// We set shared memory to be exploited here for reductions within a
// threadgroup - each thread must be able to update its accumulated output
// Note: Each threadgroup should have 32kB of data in threadgroup memory
// and threadgroup_dim_x * threadgroup_dim_y <= 1024 by design
// This should be fine for floats, but we might need to revisit
// if we ever come to doubles. In that case, we should also cut
// down the number of threads we launch in a threadgroup
compute_encoder->setThreadgroupMemoryLength(
threadgroup_dim_x * threadgroup_dim_y * out.itemsize(), 0);
if (is_out_64b_int == false) {
// Set the arguments for the kernel
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&reduction_size, sizeof(size_t), 2);
compute_encoder->setBytes(&reduction_stride, sizeof(size_t), 3);
compute_encoder->setBytes(&out_size, sizeof(size_t), 4);
compute_encoder->setBytes(shape.data(), shape.size() * sizeof(int), 5);
compute_encoder->setBytes(
strides.data(), strides.size() * sizeof(size_t), 6);
compute_encoder->setBytes(&ndim, sizeof(int), 7);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
// We set shared memory to be exploited here for reductions within a
// threadgroup - each thread must be able to update its accumulated output
// Note: Each threadgroup should have 32kB of data in threadgroup memory
// and threadgroup_dim_x * threadgroup_dim_y <= 1024 by design
// This should be fine for floats, but we might need to revisit
// if we ever come to doubles. In that case, we should also cut
// down the number of threads we launch in a threadgroup
compute_encoder->setThreadgroupMemoryLength(
safe_divup(threadgroup_dim_x * threadgroup_dim_y * out.itemsize(), 16),
0);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
} else {
// Allocate intermediate array to store reduction results from all thread
// groups
array intermediate = array(
{static_cast<int>(out.size()),
static_cast<int>(n_threadgroups_y * non_col_reductions)},
out_dtype,
nullptr,
{});
intermediate.set_data(allocator::malloc_or_wait(intermediate.nbytes()));
std::vector<array> intermediates = {intermediate};
// Set the arguments for the kernel
set_array_buffer(compute_encoder, in, 0);
set_array_buffer(compute_encoder, intermediate, 1);
compute_encoder->setBytes(&reduction_size, sizeof(size_t), 2);
compute_encoder->setBytes(&reduction_stride, sizeof(size_t), 3);
compute_encoder->setBytes(&out_size, sizeof(size_t), 4);
compute_encoder->setBytes(shape.data(), shape.size() * sizeof(int), 5);
compute_encoder->setBytes(
strides.data(), strides.size() * sizeof(size_t), 6);
compute_encoder->setBytes(&ndim, sizeof(int), 7);
// We set shared memory to be exploited here for reductions within a
// threadgroup - each thread must be able to update its accumulated output
// Note: Each threadgroup should have 32kB of data in threadgroup memory
// and threadgroup_dim_x * threadgroup_dim_y <= 1024 by design
// This should be fine for floats, but we might need to revisit
// if we ever come to doubles. In that case, we should also cut
// down the number of threads we launch in a threadgroup
compute_encoder->setThreadgroupMemoryLength(
safe_divup(threadgroup_dim_x * threadgroup_dim_y * out.itemsize(), 16),
0);
compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
// Perform second pass of reductions
// Reduce results of threadgroups along y, z from first pass, that
// collectively work on each output element.
reduction_size = n_threadgroups_y * non_col_reductions;
out_size = 1;
// Shape of axes that aren't participating in reduction remains unchanged.
std::vector<int> new_shape = rem_shape;
// Update their strides since they'll be different after the partial reduction
// in the first compute dispatch.
std::vector<size_t> new_strides = rem_strides;
new_strides.back() = reduction_size;
for (int i = new_shape.size() - 2; i >= 0; i--) {
new_strides[i] = new_shape[i + 1] * new_strides[i + 1];
}
ndim = new_shape.size();
auto row_reduce_kernel = d.get_kernel(
"row_reduce_general_no_atomics_" + op_name +
type_to_name(intermediate));
compute_encoder->setComputePipelineState(row_reduce_kernel);
set_array_buffer(compute_encoder, intermediate, 0);
set_array_buffer(compute_encoder, out, 1);
compute_encoder->setBytes(&reduction_size, sizeof(size_t), 2);
compute_encoder->setBytes(&out_size, sizeof(size_t), 3);
compute_encoder->setBytes(
new_shape.data(), new_shape.size() * sizeof(int), 4);
compute_encoder->setBytes(
new_strides.data(), new_strides.size() * sizeof(size_t), 5);
compute_encoder->setBytes(&ndim, sizeof(int), 6);
// Each thread group is responsible for 1 output
size_t n_reads = REDUCE_N_READS;
size_t thread_group_size =
row_reduce_kernel->maxTotalThreadsPerThreadgroup();
thread_group_size =
std::min((reduction_size + n_reads - 1) / n_reads, thread_group_size);
// Align thread group size with simd_size
uint simd_size = row_reduce_kernel->threadExecutionWidth();
thread_group_size =
(thread_group_size + simd_size - 1) / simd_size * simd_size;
assert(thread_group_size <= kernel->maxTotalThreadsPerThreadgroup());
// Launch enough thread groups for each output
uint n_threads = thread_group_size;
grid_dims = MTL::Size(n_threads, out.size(), 1);
group_dims = MTL::Size(thread_group_size, 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
d.get_command_buffer(s.index)->addCompletedHandler(
[intermediates](MTL::CommandBuffer*) mutable {
intermediates.clear();
});
}
}
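// safe_divup(x, 16) above rounds the threadgroup memory request up to a
// multiple of 16 bytes, which Metal is assumed to require for
// setThreadgroupMemoryLength, while safe_div keeps the degenerate zero-size
// case well defined.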
} // namespace
@@ -216,19 +460,14 @@ void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
array in = inputs[0];
// TODO: Allow specific row and column reductions with types disabled
// due to atomics ?
if (size_of(in.dtype()) == 8) {
std::ostringstream msg;
msg << "[Reduce::eval_gpu] Does not support " << in.dtype();
throw std::runtime_error(msg.str());
}
// Make sure no identity reductions trickle down here
assert(!axes_.empty());
// Continue with reduction operation
out.set_data(allocator::malloc_or_wait(out.nbytes()));
// Minimum of 4 bytes since we use size 4 structs for all reduce
// and Metal will complain otherwise
size_t min_bytes = std::max(out.nbytes(), 4ul);
out.set_data(allocator::malloc_or_wait(min_bytes));
std::string op_name;
switch (reduce_type_) {
case Reduce::And:
@@ -270,7 +509,7 @@ void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {
}
// Reduce
{
if (in.size() > 0) {
std::vector<array> copies;
ReductionPlan plan = get_reduction_plan(in, axes_);
@@ -287,7 +526,7 @@ void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {
// Reducing over everything and the data is all there no broadcasting or
// slicing etc.
if (plan.type == ContiguousAllReduce) {
all_reduce_dispatch(in, out, op_name, compute_encoder, d);
all_reduce_dispatch(in, out, op_name, compute_encoder, d, s);
}
// At least the last dimension is row contiguous and we are reducing over
@@ -295,7 +534,7 @@ void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {
else if (
plan.type == ContiguousReduce || plan.type == GeneralContiguousReduce) {
row_reduce_general_dispatch(
in, out, op_name, plan, axes_, compute_encoder, d);
in, out, op_name, plan, axes_, compute_encoder, d, s);
}
// At least the last two dimensions are contiguous and we are doing a
@@ -304,7 +543,7 @@ void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {
plan.type == ContiguousStridedReduce ||
plan.type == GeneralStridedReduce) {
strided_reduce_general_dispatch(
in, out, op_name, plan, axes_, compute_encoder, d);
in, out, op_name, plan, axes_, compute_encoder, d, s);
}
if (!copies.empty()) {


@@ -14,10 +14,15 @@ std::shared_ptr<void> new_scoped_memory_pool() {
std::function<void()> make_task(
array& arr,
std::vector<std::shared_future<void>> deps,
std::shared_ptr<std::promise<void>> p,
bool retain_graph) {
std::shared_ptr<std::promise<void>> p) {
throw std::runtime_error(
"[metal::make_task] Cannot make GPU task without metal backend");
}
// No cache for CPU only
bool cache_enabled(void) {
return false;
}
void set_cache_enabled(bool) {}
} // namespace mlx::core::metal


@@ -1,7 +1,13 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include "mlx/primitives.h"
#define NO_GPU_MULTI(func) \
void func::eval_gpu( \
const std::vector<array>& inputs, std::vector<array>& outputs) { \
throw std::runtime_error(#func " has no GPU implementation."); \
}
#define NO_GPU(func) \
void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
throw std::runtime_error(#func " has no GPU implementation."); \
@@ -11,6 +17,7 @@ namespace mlx::core {
NO_GPU(Abs)
NO_GPU(Add)
NO_GPU(AddMM)
NO_GPU(Arange)
NO_GPU(ArcCos)
NO_GPU(ArcCosh)
@@ -30,6 +37,8 @@ NO_GPU(Convolution)
NO_GPU(Copy)
NO_GPU(Cos)
NO_GPU(Cosh)
NO_GPU_MULTI(CustomVJP)
NO_GPU_MULTI(Depends)
NO_GPU(Divide)
NO_GPU(Remainder)
NO_GPU(Equal)
@@ -48,6 +57,8 @@ NO_GPU(Load)
NO_GPU(Log)
NO_GPU(Log1p)
NO_GPU(LogicalNot)
NO_GPU(LogicalAnd)
NO_GPU(LogicalOr)
NO_GPU(LogAddExp)
NO_GPU(Matmul)
NO_GPU(Maximum)
@@ -72,6 +83,7 @@ NO_GPU(Sinh)
NO_GPU(Slice)
NO_GPU(Softmax)
NO_GPU(Sort)
NO_GPU_MULTI(Split)
NO_GPU(Square)
NO_GPU(Sqrt)
NO_GPU(StopGradient)
@@ -79,5 +91,6 @@ NO_GPU(Subtract)
NO_GPU(Tan)
NO_GPU(Tanh)
NO_GPU(Transpose)
NO_GPU_MULTI(DivMod)
NO_GPU_MULTI(QRF)
} // namespace mlx::core

mlx/compile.cpp (new file, 440 lines)

@@ -0,0 +1,440 @@
// Copyright © 2023-2024 Apple Inc.
#include <cstdlib>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include "mlx/allocator.h"
#include "mlx/primitives.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"
namespace mlx::core {
namespace detail {
bool& compiler_disabled() {
auto get_val = []() {
if (const char* buff_str = std::getenv("MLX_DISABLE_COMPILE")) {
return true;
} else {
return false;
}
};
static bool compiler_disabled_ = get_val();
return compiler_disabled_;
}
#define MAX_OPS_PER_BUFFER max_ops_per_buffer()
using CompileFn = std::function<std::vector<array>(const std::vector<array>&)>;
using ParentsMap =
std::unordered_map<std::uintptr_t, std::vector<std::pair<array, int>>>;
template <typename T, typename... U>
size_t getAddress(std::function<T(U...)> f) {
typedef T(fnType)(U...);
fnType** fnPointer = f.template target<fnType*>();
if (fnPointer == nullptr) {
throw std::invalid_argument(
"[compile] Cannot compile a non-addressable function.");
}
return (size_t)*fnPointer;
}
struct CompilerCache {
struct CacheEntry {
std::vector<array> inputs;
std::vector<array> outputs;
std::vector<array> tape;
bool empty{true};
};
// Returns a reference to a CacheEntry which can be updated
// by the caller to avoid copying large tapes / inputs / outputs
CacheEntry& find(size_t fun_id, const std::vector<array>& inputs) {
// Try to find the entry
auto [entry_it, inserted] = cache_.insert({fun_id, {}});
auto& entries = entry_it->second;
auto is_match = [](const std::vector<array>& in1,
const std::vector<array>& in2) {
if (in1.size() != in2.size()) {
throw std::runtime_error(
"[compiler] Got different number of inputs to function,"
" this should never happen.");
}
for (int i = 0; i < in1.size(); ++i) {
if (in1[i].shape() != in2[i].shape()) {
return false;
}
if (in1[i].dtype() != in2[i].dtype()) {
return false;
}
}
return true;
};
// Loop over entries and check inputs match i.e. shapes and types must be
// equal. Note this could get really slow if one compiles the same
// function with many different shapes. May want to store entries in a
// more easily searchable structure.
for (auto& entry : entries) {
// Check the inputs match and return if so
if (is_match(inputs, entry.inputs)) {
return entry;
}
}
// Otherwise append a new cache entry
entries.push_back(CacheEntry{});
return entries.back();
};
void erase(size_t fun_id) {
cache_.erase(fun_id);
}
private:
CompilerCache() {
// Make sure the allocator is fully
// initialized before the compiler cache
allocator::allocator();
}
friend CompilerCache& compiler_cache();
std::unordered_map<size_t, std::vector<CacheEntry>> cache_;
};
CompilerCache& compiler_cache() {
static CompilerCache compiler_cache_;
return compiler_cache_;
}
std::pair<std::vector<array>, std::vector<array>> compile_trace(
const std::function<std::vector<array>(const std::vector<array>&)>& fun,
const std::vector<array>& inputs) {
// Set the global tracing flag.
detail::InTracing in_tracing;
// Run the function on placeholder inputs
// to get compute graph
std::vector<array> tracer_inputs;
for (int i = 0; i < inputs.size(); ++i) {
array in(inputs[i].shape(), inputs[i].dtype(), nullptr, {});
in.set_tracer(true);
tracer_inputs.push_back(std::move(in));
}
return {tracer_inputs, fun(tracer_inputs)};
}
// Traverses the graph to build a tape and a map of array ids to their parents
std::pair<std::vector<array>, ParentsMap> compile_dfs(
const std::vector<array>& inputs,
const std::vector<array>& outputs) {
std::function<void(const array&)> recurse;
std::vector<array> tape;
std::unordered_set<std::uintptr_t> input_set;
std::unordered_map<std::uintptr_t, std::vector<std::pair<array, int>>>
parents_map;
for (int i = 0; i < inputs.size(); ++i) {
auto in = inputs[i];
input_set.insert(in.id());
}
// DFS the graph to build the tape, and log parents and scalars
std::unordered_set<std::uintptr_t> cache;
recurse = [&](const array& a) {
auto id = a.id();
if (cache.find(id) != cache.end()) {
return;
}
for (int i = 0; i < a.inputs().size(); i++) {
auto& in = a.inputs()[i];
parents_map[in.id()].push_back({a, i});
for (auto& s : a.siblings()) {
parents_map[in.id()].push_back({s, i});
}
// Don't recurse on inputs (but add them to the tape for the purpose
// of future optimizations)
if (input_set.find(a.id()) == input_set.end()) {
recurse(in);
}
}
cache.insert(id);
for (auto& s : a.siblings()) {
cache.insert(s.id());
}
tape.push_back(a);
};
for (auto& a : outputs) {
recurse(a);
}
return {tape, parents_map};
}
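// Because recurse appends a node only after visiting its inputs, the tape
// comes out in post-order: every array appears after all of its dependencies,
// which is what lets compile_replace below replay it front to back.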
// Simplify the tape. Note, this function modifies in-place both the tape and
// the parents map to remove orphaned arrays
void compile_simplify(
std::vector<array>& tape,
ParentsMap& parents_map,
const std::vector<array>& outputs,
int passes) {
// Helpers to identify identical scalars
std::map<std::pair<uint64_t, Dtype::Val>, array> scalars;
auto is_scalar = [](const array& a) {
return a.is_evaled() && a.ndim() == 0;
};
auto get_scalar_rep = [](const array& a) {
uint64_t v = 0;
int dtype;
switch (a.dtype().size) {
case 1:
v = *a.data<uint8_t>();
break;
case 4:
v = *a.data<uint32_t>();
break;
case 8:
v = *a.data<uint64_t>();
break;
}
return std::make_pair(v, a.dtype().val);
};
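// Example of the scalar key above (illustrative): two separately created
// float32 scalars holding 2.0f produce the same (bit-pattern, dtype) pair, so
// the fuse pass below rewires every consumer of the duplicate onto the first
// occurrence and drops the duplicate from the tape.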
for (auto& a : tape) {
if (is_scalar(a)) {
scalars.insert({get_scalar_rep(a), a});
}
}
// Helper that fuses two arrays in the graph by setting the parents of the
// source to point to the destination
auto fuse = [&](array& dst, array& src) {
// Canonicalize the order of the primitives outputs
auto sources = src.outputs();
auto dests = dst.outputs();
// For each src parent, point it to the corresponding dest
for (int i = 0; i < sources.size(); ++i) {
auto src_parents = parents_map.find(sources[i].id());
if (src_parents == parents_map.end()) {
continue;
}
auto& pairs = parents_map[dests[i].id()];
for (auto& parent : src_parents->second) {
parent.first.inputs()[parent.second] = dests[i];
pairs.push_back(parent);
}
// Remove the source from the map to avoid fusing with it again
parents_map.erase(src_parents);
}
};
// Depth-1 array equivalence check.
auto array_equivalent = [](const array& a, const array& b) {
if (!a.has_primitive() || !b.has_primitive()) {
return false;
}
if (a.primitive_id() == b.primitive_id()) {
return false;
}
const auto& pa = a.primitive();
const auto& pb = b.primitive();
if (typeid(pa) != typeid(pb)) {
return false;
}
if (a.inputs().size() != b.inputs().size()) {
return false;
}
for (int i = 0; i < a.inputs().size(); i++) {
if (a.inputs()[i].id() != b.inputs()[i].id()) {
return false;
}
}
return pa.is_equivalent(pb);
};
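// In other words, this is a depth-1 common-subexpression check: two nodes are
// merged only if they run the same primitive type on the exact same input
// arrays (compared by id) and the primitives report is_equivalent (e.g. the
// same reduction axes). Inputs are not compared recursively, which is why the
// simplification below is run for several passes.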
// Pass 0: fuse scalars
std::vector<array> new_tape;
for (auto& arr : tape) {
// Check if we can fuse scalars
if (is_scalar(arr)) {
auto scalar = scalars.find(get_scalar_rep(arr));
if (scalar->second.id() != arr.id()) {
fuse(scalar->second, arr);
// Don't keep orphaned scalars in the tape
continue;
}
}
new_tape.push_back(std::move(arr));
}
tape = std::move(new_tape);
std::unordered_set<uintptr_t> output_set;
for (auto& o : outputs) {
output_set.insert(o.id());
}
// Pass 1..passes: fuse only keeping non-orphaned arrays in the tape
for (int pass = 0; pass < passes; ++pass) {
for (auto& arr : tape) {
// Helper to check if we can fuse the parents of the
// given array
auto maybe_fuse_parents = [&](auto& a) {
auto parents = parents_map.find(a.id());
if (parents != parents_map.end()) {
auto N = parents->second.size();
std::vector<bool> mask(N, false);
for (int i = 0; i < N; i++) {
if (mask[i]) {
continue;
}
for (int j = i + 1; j < N; j++) {
if (mask[j]) {
continue;
}
auto& src = parents->second[j].first;
auto& dst = parents->second[i].first;
if (src.id() != dst.id() && array_equivalent(src, dst)) {
fuse(dst, src);
mask[j] = true;
}
}
}
// Erase orphaned parents so we don't keep fusing with them
for (int i = N - 1; i > 0; --i) {
if (mask[i]) {
parents->second.erase(parents->second.begin() + i);
}
}
return false;
} else {
return output_set.find(a.id()) == output_set.end();
}
};
bool discard = maybe_fuse_parents(arr);
for (auto& s : arr.siblings()) {
discard &= maybe_fuse_parents(s);
}
// If an array and its siblings have no parents, and none of them are
// outputs, it is safe to remove it from the tape
if (!discard) {
new_tape.push_back(std::move(arr));
}
}
tape = std::move(new_tape);
}
}
std::vector<array> compile_replace(
const std::vector<array>& tape,
const std::vector<array>& trace_inputs,
const std::vector<array>& trace_outputs,
const std::vector<array>& inputs) {
std::unordered_map<uintptr_t, array> trace_to_real;
for (int i = 0; i < inputs.size(); ++i) {
trace_to_real.insert({trace_inputs[i].id(), inputs[i]});
}
for (auto& a : tape) {
// Arrays in the tape without primitives are constants
// and can be used directly
if (!a.has_primitive()) {
trace_to_real.insert({a.id(), a});
} else {
// Find real inputs
std::vector<array> real_inputs;
for (auto& in : a.inputs()) {
real_inputs.push_back(trace_to_real.at(in.id()));
}
if (a.siblings().empty()) {
auto real_a = array(
a.shape(), a.dtype(), a.primitive_ptr(), std::move(real_inputs));
trace_to_real.insert({a.id(), std::move(real_a)});
} else {
// Ensure the order is correct for multi-output primitives
std::vector<std::vector<int>> shapes;
std::vector<Dtype> types;
auto trace_out = a.outputs();
for (auto& o : trace_out) {
shapes.push_back(o.shape());
types.push_back(o.dtype());
}
auto real_out =
array::make_arrays(shapes, types, a.primitive_ptr(), real_inputs);
for (int i = 0; i < trace_out.size(); ++i) {
trace_to_real.insert({trace_out[i].id(), std::move(real_out[i])});
}
}
}
}
std::vector<array> outputs;
for (auto& o : trace_outputs) {
outputs.push_back(trace_to_real.at(o.id()));
}
return outputs;
}
std::function<std::vector<array>(const std::vector<array>&)> compile(
const std::function<std::vector<array>(const std::vector<array>&)>& fun,
size_t fun_id) {
if (compiler_disabled()) {
return fun;
}
return [fun, fun_id](const std::vector<array>& inputs) {
// Find a cache entry with the correct inputs
auto& entry = compiler_cache().find(fun_id, inputs);
// No matching cache entry existed, so compile
if (entry.empty) {
// Mark the entry as not empty since we are about to fill it
entry.empty = false;
// Trace to build the graph
std::tie(entry.inputs, entry.outputs) = compile_trace(fun, inputs);
// DFS the graph and get a tape, and a map of array id to (parent,
// position in parent inputs)
std::unordered_map<uintptr_t, std::vector<std::pair<array, int>>>
parents_map;
std::tie(entry.tape, parents_map) =
compile_dfs(entry.inputs, entry.outputs);
// Simplify the tape
compile_simplify(entry.tape, parents_map, entry.outputs, /* passes */ 3);
// This is a good point to do more optimizations, e.g. kernel fusion to
// generate new primitives. The tape needs to be updated accordingly
}
// At this point we must have a tape, now replace the placeholders
// with real arrays that can be evaluated
return compile_replace(entry.tape, entry.inputs, entry.outputs, inputs);
};
}
void compile_erase(size_t fun_id) {
detail::compiler_cache().erase(fun_id);
}
} // namespace detail
std::function<std::vector<array>(const std::vector<array>&)> compile(
const std::function<std::vector<array>(const std::vector<array>&)>& fun) {
if (detail::compiler_disabled()) {
return fun;
}
auto fun_id = detail::getAddress(fun);
return detail::compile(fun, fun_id);
}
void disable_compile() {
detail::compiler_disabled() = true;
}
void enable_compile() {
detail::compiler_disabled() = false;
}
} // namespace mlx::core
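For reference, a minimal usage sketch of the compile entry point defined above. This is illustrative only; it assumes the mlx/mlx.h umbrella header plus exp, operator+, and eval from the ops and transforms headers, none of which are part of this file.

#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  auto fun = [](const std::vector<array>& inputs) {
    return std::vector<array>{exp(inputs[0]) + inputs[1]};
  };
  // The first call traces the function, builds the tape, and simplifies it;
  // subsequent calls reuse the cached tape and only run compile_replace.
  auto cfun = compile(fun);
  auto outputs = cfun({array(1.0f), array(2.0f)});
  eval(outputs);
  return 0;
}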


@@ -12,9 +12,7 @@
namespace mlx::core {
using OptionalArrayRef = std::optional<std::reference_wrapper<const array>>;
struct ArrayNames {
struct NodeNamer {
std::unordered_map<std::uintptr_t, std::string> names;
std::string get_name(const array& x) {
@@ -37,37 +35,30 @@ struct ArrayNames {
};
void depth_first_traversal(
std::function<void(OptionalArrayRef, const array&, int)> callback,
std::function<void(array)> callback,
const std::vector<array>& outputs) {
std::function<void(OptionalArrayRef, const array&, int)> recurse;
std::function<void(const array&)> recurse;
std::unordered_set<std::uintptr_t> cache;
recurse = [&](OptionalArrayRef parent, const array& x, int input_index) {
recurse = [&](const array& x) {
auto id = x.id();
if (cache.find(id) != cache.end()) {
return;
}
cache.insert(id);
for (int i = 0; i < x.inputs().size(); i++) {
recurse(x, x.inputs()[i], i);
for (auto& s : x.siblings()) {
cache.insert(s.id());
}
callback(parent, x, input_index);
for (auto& in : x.inputs()) {
recurse(in);
}
callback(x);
};
for (auto x : outputs) {
recurse(std::nullopt, x, 0);
for (auto& o : outputs) {
recurse(o);
}
}
void depth_first_traversal(
std::function<void(const array&)> callback,
const std::vector<array>& outputs) {
depth_first_traversal(
[&callback](OptionalArrayRef p, const array& x, int input_index) {
callback(x);
},
outputs);
}
void print_graph(std::ostream& os, const std::vector<array>& outputs) {
std::vector<array> tape;
std::vector<array> inputs;
@@ -82,15 +73,11 @@ void print_graph(std::ostream& os, const std::vector<array>& outputs) {
},
outputs);
ArrayNames namer;
auto print_arr = [&namer, &os](const array& a) {
os << namer.get_name(a);
os << " [" << a.shape() << ", " << a.dtype() << "]";
};
auto print_arrs = [&](const std::vector<array>& arrs) {
NodeNamer namer;
auto print_arrs = [&namer, &os](std::vector<array> arrs) {
for (auto& arr : arrs) {
print_arr(arr);
os << namer.get_name(arr);
os << " [" << arr.shape() << ", " << arr.dtype() << "]";
if (&arr != &arrs.back()) {
os << ", ";
}
@@ -108,7 +95,7 @@ void print_graph(std::ostream& os, const std::vector<array>& outputs) {
os << " ";
print_arrs(arr.inputs());
os << " -> ";
print_arr(arr);
print_arrs(arr.outputs());
os << "\n";
}
}
@@ -116,26 +103,45 @@ void print_graph(std::ostream& os, const std::vector<array>& outputs) {
void export_to_dot(std::ostream& os, const std::vector<array>& outputs) {
os << "digraph {" << std::endl;
ArrayNames namer;
std::unordered_set<std::uintptr_t> output_set;
for (auto& o : outputs) {
output_set.insert(o.id());
}
std::unordered_set<std::uintptr_t> input_set;
NodeNamer namer;
depth_first_traversal(
[&namer, &os](auto parent, const array& x, int input_index) {
os << "{ ";
[&](const array& x) {
if (!x.has_primitive()) {
os << "rank=source; ";
input_set.insert(x.id());
os << "{ rank=source; " << namer.get_name(x) << "; }" << std::endl;
return;
}
if (!parent) {
os << "rank=sink; ";
}
os << namer.get_name(x);
// Node for primitive
if (x.has_primitive()) {
os << "{ ";
os << x.primitive_id();
os << " [label =\"";
x.primitive().print(os);
os << "\"]";
os << "\", shape=rectangle]";
os << "; }" << std::endl;
// Arrows to primitive's inputs
for (auto& a : x.inputs()) {
os << namer.get_name(a) << " -> " << x.primitive_id() << std::endl;
}
}
os << "; }" << std::endl;
for (auto c : x.inputs()) {
os << namer.get_name(c) << " -> " << namer.get_name(x) << std::endl;
// Point outputs to their primitive
for (auto& a : x.outputs()) {
os << "{ ";
if (output_set.find(a.id()) != output_set.end()) {
os << "rank=sink; ";
}
os << namer.get_name(a);
os << "; }" << std::endl;
if (x.has_primitive()) {
os << x.primitive_id() << " -> " << namer.get_name(a) << std::endl;
}
}
},
outputs);
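A minimal usage sketch of the reworked export_to_dot: inputs are emitted as rank=source nodes, each primitive as a rectangle node, and the requested outputs as rank=sink nodes. The example assumes the mlx/mlx.h umbrella header and the + op, which are not part of this diff.

#include <fstream>
#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  array a = array(1.0f);
  array b = array(2.0f);
  array c = a + b;
  std::ofstream dot_file("graph.dot");
  // Writes source nodes for a and b, a rectangle node for the Add primitive,
  // edges from a and b into the primitive, and an edge from the primitive to
  // the sink node c.
  export_to_dot(dot_file, std::vector<array>{c});
  return 0;
}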

mlx/io.h (new file)

@@ -0,0 +1,55 @@
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <variant>
#include "mlx/array.h"
#include "mlx/io/load.h"
#include "mlx/ops.h"
#include "mlx/stream.h"
namespace mlx::core {
/** Save array to out stream in .npy format */
void save(std::shared_ptr<io::Writer> out_stream, array a);
/** Save array to file in .npy format */
void save(const std::string& file, array a);
/** Load array from reader in .npy format */
array load(std::shared_ptr<io::Reader> in_stream, StreamOrDevice s = {});
/** Load array from file in .npy format */
array load(const std::string& file, StreamOrDevice s = {});
/** Load array map from .safetensors file format */
std::unordered_map<std::string, array> load_safetensors(
std::shared_ptr<io::Reader> in_stream,
StreamOrDevice s = {});
std::unordered_map<std::string, array> load_safetensors(
const std::string& file,
StreamOrDevice s = {});
void save_safetensors(
std::shared_ptr<io::Writer> out_stream,
std::unordered_map<std::string, array>);
void save_safetensors(
const std::string& file,
std::unordered_map<std::string, array>);
using MetaData =
std::variant<std::monostate, array, std::string, std::vector<std::string>>;
/** Load array map and metadata from .gguf file format */
std::pair<
std::unordered_map<std::string, array>,
std::unordered_map<std::string, MetaData>>
load_gguf(const std::string& file, StreamOrDevice s = {});
void save_gguf(
std::string file,
std::unordered_map<std::string, array> array_map,
std::unordered_map<std::string, MetaData> meta_data = {});
} // namespace mlx::core
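A minimal sketch of the I/O API declared above; the file names are illustrative, and ones plus the mlx/mlx.h umbrella header are assumptions, not part of this header.

#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  array a = ones({4, 4});
  save("a.npy", a);                  // single array in .npy format
  array b = load("a.npy");
  save_safetensors("weights.safetensors", {{"a", a}, {"b", b}});
  auto weights = load_safetensors("weights.safetensors");
  // GGUF loading returns both the tensors and the key/value metadata.
  auto [tensors, metadata] = load_gguf("model.gguf");
  return 0;
}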


@@ -3,4 +3,43 @@ target_sources(
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/safetensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/gguf.cpp
${CMAKE_CURRENT_SOURCE_DIR}/gguf_quants.cpp
)
MESSAGE(STATUS "Downloading json")
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
FetchContent_MakeAvailable(json)
target_include_directories(
mlx PUBLIC
$<BUILD_INTERFACE:${json_SOURCE_DIR}/single_include/nlohmann>
$<INSTALL_INTERFACE:include/json>
)
install(
DIRECTORY ${json_SOURCE_DIR}/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/json
COMPONENT json_source
)
MESSAGE(STATUS "Downloading gguflib")
FetchContent_Declare(gguflib
GIT_REPOSITORY https://github.com/antirez/gguf-tools/
GIT_TAG af7d88d808a7608a33723fba067036202910acb3
)
FetchContent_MakeAvailable(gguflib)
target_include_directories(
mlx PUBLIC
$<BUILD_INTERFACE:${gguflib_SOURCE_DIR}>
$<INSTALL_INTERFACE:include/gguflib>
)
install(
DIRECTORY ${gguflib_SOURCE_DIR}/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gguflib
COMPONENT gguflib_source
)
add_library(
gguflib STATIC
${gguflib_SOURCE_DIR}/fp16.c
${gguflib_SOURCE_DIR}/gguflib.c)
target_link_libraries(mlx $<BUILD_INTERFACE:gguflib>)

Some files were not shown because too many files have changed in this diff.