Compare commits

...

165 Commits

Author SHA1 Message Date
Awni Hannun
9231617eb3 Move to nanobind v2 (#1316) 2024-08-08 17:17:46 -07:00
Alex Barron
32668a7317 CPU mx.linalg.cholesky_inverse and mx.linalg.tri_inv (#1307)
* add cholesky inv + tri inv

* always run tri_inv on cpu

* consistent naming
2024-08-08 15:18:02 -07:00
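
A minimal usage sketch for the new routines (binding names follow the commit title; the `upper` flag and the explicit CPU stream are assumptions):

    import mlx.core as mx

    # Hypothetical example: invert a lower-triangular matrix on the CPU.
    a = mx.array([[1.0, 0.0], [2.0, 3.0]])
    a_inv = mx.linalg.tri_inv(a, upper=False, stream=mx.cpu)
    print(a @ a_inv)  # approximately the identity matrix
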
Angelos Katharopoulos
780c197f95 Fix test tolerance and patch bump (#1315) 2024-08-08 14:51:09 -07:00
Angelos Katharopoulos
eb8819e91e Revert variance to be numerically stable (#1314) 2024-08-08 13:35:02 -07:00
Awni Hannun
30bbea2f08 Add gemv masked to JIT plus some fixes (#1310)
* add gemv masked to JIT plus some fixes

* some cleanup

* add utils

* fix

* fix 2

* more cleaning

* fix

* remove unused mps matmul support

* one more nit

* revert
2024-08-07 13:38:07 -07:00
Alex Barron
635ccd9e25 Add "edge" mode to mx.pad (#1309)
* Add edge padding mode

* fix pad in pooling

* string arg instead of enum
2024-08-06 11:23:10 -07:00
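
A short sketch of the new mode; the string-valued mode argument matches the commit's "string arg instead of enum" note, and the rest of the signature is assumed:

    import mlx.core as mx

    x = mx.array([[1, 2], [3, 4]])
    # "edge" replicates border values instead of padding with a constant
    y = mx.pad(x, 1, mode="edge")
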
nicolov
8c9f0278b9 Add vmap to scatter (#1200)
* Add vmap to scatter

* updates

* vmap updates + a few more tests

* bug fix

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-08-05 20:12:27 -07:00
Awni Hannun
58d0e199e1 add bfloat conv for winograd (#1306)
* add bfloat conv for winograd

* accumulate in fp32

* accumulate in fp32

* accumulate in bf16
2024-08-05 15:51:13 -07:00
Awni Hannun
10b5835501 fix creating array from bf16 tensors in jax / torch (#1305) 2024-08-01 16:20:51 -07:00
Awni Hannun
6c8dd307eb faster group norm (#1304) 2024-08-01 12:49:23 -07:00
Awni Hannun
43ffdab172 fix rope and random (#1301)
* fix rope and random

* comment
2024-07-31 16:18:25 -07:00
Awni Hannun
40b6d67333 Fixes for large arrays with a few ops (#1299)
* fixes for large arrays with a few ops

* fix bug

* fix all of copy
2024-07-30 17:18:39 -07:00
Alex Barron
c52d1600f0 Fused Affine Quantize/Dequantize ops (#1282)
* Add fast affine dequantize

* add full quantize kernel

* fused kernel with scale/bias computation

* fix docstring

* fix no jit error

* fix test

* test fix

* reduce fast api to only affine_quantize
2024-07-29 15:11:38 -07:00
Awni Hannun
aa1d6cadad Fix docs latex build and nits (#1297)
* fix docs latex build and nits

* fix stub gen and try to clean up building
2024-07-29 11:44:06 -07:00
Atakan Tekparmak
6e06e3a904 feat: Added "tanh" option to GELU approximation (#1268) 2024-07-28 09:07:56 +02:00
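
A sketch of the new option, assuming it is exposed through nn.GELU's existing approx argument:

    import mlx.core as mx
    import mlx.nn as nn

    gelu = nn.GELU(approx="tanh")  # tanh-based GELU approximation
    y = gelu(mx.array([-1.0, 0.0, 1.0]))
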
Yaroslav
8cfb9fc0b8 Update requirements.txt (#1291) 2024-07-26 12:59:52 -07:00
Awni Hannun
7b456fd2c0 Array api (#1289)
* some updates for numpy 2.0 and array api

* some updates for numpy 2.0 and array api

* fix array api doc
2024-07-26 10:40:49 -07:00
Awni Hannun
e9e53856d2 patch bump (#1287) 2024-07-25 11:42:09 -07:00
Anton Belov
5029894662 [Issue #1187] Add nan_to_num function initial attempt (#1247)
* initial attempt, working with wrong types

* not compiling; mx.float16 and mx.bfloat16 tests added

* fix nan to num

* nit

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-07-25 09:57:37 -07:00
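
A usage sketch, assuming the binding mirrors NumPy's nan_to_num signature:

    import mlx.core as mx

    x = mx.array([float("nan"), float("inf"), -float("inf"), 1.0])
    # replace NaNs and infinities with finite values
    y = mx.nan_to_num(x, nan=0.0, posinf=1e6, neginf=-1e6)
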
Awni Hannun
baf9fa5f42 Einsum (#1269)
* einsum initial

* fix comma break

* sum axis was wrong

* small cleanups

* python binding

* changed bindings to resemble numpy

* remove todo comment

* comment changes

* add count of operands/inputs

* fail fast if operands list is empty

* ignore comma if no output

* einsum path matching numpy

* getting somewhere with path

* remove print

* it passes the first test

* moved einsum tests to separate file

* separated einsum path

* moved einsum naive

* remove space from equation

* fast fail if no operands passed

* update tests and remove printf

* small cleanup

* some more cleanups

* removed python helper file

* ack

* utilize std for finding min in vector

* duplicate def

* remove the tuple as it was unreadable

* moved einsum_naive back to ops

* remaining isn't needed

* avoid creating another set

* cleanup

* greedy path, start of naive einsum

* more einsum

* fix some bugs

* some more fixes, tests pass

* benchmark

* some simplify

* fix einsum and test

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>

* add a bunch more tests and fix a bunch more bugs

* some docs nits

---------

Co-authored-by: dc-dc-dc <dgcruz983@gmail.com>
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-07-25 09:36:44 -07:00
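
A minimal sketch of the new bindings, which per the commit were changed to resemble NumPy's:

    import mlx.core as mx

    a = mx.ones((8, 16))
    b = mx.ones((16, 4))
    c = mx.einsum("ij,jk->ik", a, b)          # matrix multiply via einsum
    path = mx.einsum_path("ij,jk->ik", a, b)  # inspect the planned contraction order
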
Jagrit Digani
7f914365fd Fix GPU sort for large arrays (#1285)
* Fix GPU sort for large arrays
2024-07-24 14:37:10 -07:00
Paul Paczuski
ebd7135b50 Improve stability of BCE loss calculation for input probabilities close to or exactly 0 or 1 (#1280)
* Improve stability of BCE loss calculation

* Standardize comment

* Apply formatting with black via pre-commit

* Add usage recommendation to docstring

* Update python/mlx/nn/losses.py

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-07-24 08:38:22 -07:00
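
A sketch of the stabilized path; the with_logits flag is assumed from the existing loss API:

    import mlx.core as mx
    import mlx.nn as nn

    probs = mx.array([0.0, 0.1, 0.9, 1.0])  # values at exactly 0 and 1 no longer blow up
    targets = mx.array([0.0, 0.0, 1.0, 1.0])
    loss = nn.losses.binary_cross_entropy(probs, targets, with_logits=False)
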
fgranqvist
50eff6a10a Implement sampling from laplace distribution. (#1279) 2024-07-24 15:15:37 +02:00
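
A usage sketch, assuming the new sampler mirrors mx.random.normal's signature (parameter names loc and scale are assumptions):

    import mlx.core as mx

    samples = mx.random.laplace(shape=(1000,), loc=0.0, scale=1.0)
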
Alex Barron
c34a5ae7f7 Fix bfloat16 Hadamard (#1283)
* fix bfloat16 hadamard

* add scale

* review comments

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-07-23 14:54:43 -07:00
Awni Hannun
e2aa6ec8ae some fixes (#1281) 2024-07-23 11:49:05 -07:00
toji
6768c6a54a Adding missing type hints (#1243)
* added type hints for `run`, `tree_map` and `tree_map_with_path`

* fix lint

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-07-23 07:29:38 -07:00
Tim Gymnich
6307d166eb Fix overflow / underflow handling for expm1f (#1278)
* Fix overflow / underflow handling for expm1f

* update tests
2024-07-23 07:29:06 -07:00
Awni Hannun
1fba87b0df Fix leak with multi-output primitives (#1274)
* fix leak with multi-output primitives

* hopefully an actual fix
2024-07-23 06:34:18 -07:00
Awni Hannun
df124e018a fix gguf (#1273)
* fix gguf

* comment
2024-07-18 07:35:35 -07:00
Cheng
2f83d6e4b7 Do not release buffers on exit (#1142) 2024-07-15 15:12:24 -07:00
Feng Shijie
987785d8d7 Fix typo and missing header (#1266) 2024-07-15 08:20:24 -07:00
Awni Hannun
8c01a7893b minor fix in optimizer + docs (#1264) 2024-07-12 12:18:02 -07:00
Awni Hannun
218047c75a docs fixes (#1263) 2024-07-11 15:59:07 -07:00
Alex Barron
d0da74209b version bump (#1260) 2024-07-11 11:17:55 -07:00
Angelos Katharopoulos
5c1fa64fb0 Custom transforms (#1246) 2024-07-10 18:00:01 -07:00
Alex Barron
a3c287354f Fast Hadamard Transform (#1249)
* Working hadamard for powers of 2

* working for m*2^k

* add scale and check contiguity

* add size check

* clean up

* fix test

* add grads + vmap

* gpu only

* skip on linux

* test typo

* add cpu impl

* remove gpu only tests

* fix linux build + add is_equivalent
2024-07-09 20:39:01 -07:00
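
A usage sketch; per the commit the transform supports lengths of the form m*2^k, and the default output scale of 1/sqrt(n) is an assumption:

    import mlx.core as mx

    x = mx.random.normal(shape=(4, 1024))
    y = mx.hadamard_transform(x)  # pass scale=... to override the default
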
Angelos Katharopoulos
03cf033f82 Fix reshape copy bug (#1253) 2024-07-07 21:37:00 -07:00
Alex Barron
bdb36c9a63 add zero vjps for bitwise ops and gather w.r.t. index (#1256) 2024-07-07 21:34:59 -07:00
Awni Hannun
20bb301195 CPU binary reduction + Nits (#1242)
* very minor nits

* reduce binary

* fix test
2024-06-28 13:50:42 -07:00
Awni Hannun
d6383a1c6a version bump (#1239) 2024-06-27 10:43:13 -07:00
Angelos Katharopoulos
b05bcfd27f Fixes segfault when compiling checkpointed functions (#1235) 2024-06-26 16:14:45 -07:00
Alex Barron
2615660e62 Fix strided sort bug (#1236)
* Use output strides in sort kernel

* fix zero strides bug
2024-06-26 14:32:11 -07:00
Awni Hannun
5b0af4cdb1 fix donation condition for compilation (#1237) 2024-06-26 09:04:05 -07:00
Jagrit Digani
8c2e15e6c8 Accelerate import updates for iOS (#1227)
* Update veclib and bnns includes to #include <Accelerate/Accelerate.h> for compatibility with iOS

* Mark float literals in softmax.cpp to be float16_t for errors in iOS

* Add arm neon vector operation guards

* Redirect to common backend for consistency
2024-06-26 09:01:50 -07:00
Awni Hannun
56c8a33439 Get metal version from xcode (#1228)
* get metal version from xcode

* typo

* fix
2024-06-26 07:02:11 -07:00
David Koski
4eef1e8a3e fix typo (#1215) 2024-06-24 13:36:35 -07:00
Alex Barron
95d11bda06 Fix NumPy 2.0 pickle test (#1221)
* fix numpy version <2 temporarily

* typo

* better fix

* Fix just for bfloat16

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-23 05:47:22 -07:00
Awni Hannun
af9079cc1f version bump (#1212) 2024-06-14 11:28:51 -07:00
Jagrit Digani
2d6cd47713 Masked gemv (#1211) 2024-06-14 09:52:26 -07:00
Awni Hannun
fe3167d7ea smaller CPU binary (#1203)
* smaller CPU binary

* fix no cpu build
2024-06-14 09:46:55 -07:00
Awni Hannun
31e134be35 Build for macOS 15 (#1208)
* Build for macos 15

* metal32 as well

* comment

---------

Co-authored-by: Awni Hannun <Awni Hannun>
2024-06-13 13:31:44 -07:00
Awni Hannun
e84ba8056d only allow openmpi (#1209) 2024-06-13 12:14:44 -07:00
Fangjun Kuang
f20e97b092 minor fixes (#1194)
* minor fixes

* fix build errors
2024-06-12 22:06:49 -07:00
Alex Barron
934683088e Refactor JIT for unary/binary/ternary ops (#1206)
* refactor unary/binary/ternary ops

* get_primitive_string util

---------
2024-06-12 14:22:12 -07:00
Awni Hannun
de2b9e7d0a Fix kernel deps to reduce build times (#1205) 2024-06-12 11:17:39 -07:00
Alex Barron
dd7d8e5e29 Add Quantized Ops to the JIT (#1204)
* JIT for quantized ops

* remove unused imports

* address comments

* fix imports

* second attempt to fix imports

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-12 09:47:12 -07:00
Awni Hannun
df964132fb fix scatter + test (#1202)
* fix scatter + test

* fix test warnings

* fix metal validation
2024-06-11 14:35:12 -07:00
Awni Hannun
709ccc6800 install mpi for release build (#1199) 2024-06-10 10:09:32 -07:00
Awni Hannun
cf236fc390 version (#1191) 2024-06-06 17:16:40 -07:00
Alex Barron
27d70c7d9d Feature complete Metal FFT (#1102)
* feature complete metal fft

* fix contiguity bug

* jit fft

* simplify rader/bluestein constant computation

* remove kernel/utils.h dep

* remove bf16.h dep

* format

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-06 12:57:25 -07:00
nicolov
0e585b4409 Add docstring for scatter (#1189)
* Add docstring for scatter

* docs nits

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-06-06 11:51:25 -07:00
Angelos Katharopoulos
0163a8e57a Add docs for the distributed namespace (#1184) 2024-06-06 11:37:00 -07:00
Awni Hannun
578842954c fix jit scan when output doesn't have primitive (#1190) 2024-06-06 07:24:58 -07:00
Awni Hannun
496315fe1d Fix scan (#1188)
* fix scan

* improve grid size

* fix cpu cummax
2024-06-05 14:21:58 -07:00
Angelos Katharopoulos
0fe6895893 Fix the hard-shrink test (#1185) 2024-06-04 16:22:56 -07:00
Nikhil Mehta
0b7d71fd2f Add softmin, hardshrink, hardtanh (#1180)
---------

Co-authored-by: Nikhil Mehta <nikmehta@tesla.com>
2024-06-04 15:48:18 -07:00
Awni Hannun
83b11bc58d Fix Metal API validation for empty concat (#1183) 2024-06-04 13:17:08 -07:00
Alex Barron
375a8bbdcc Add some internal GPU apis (#1177)
* Add unary/binary/ternay/slice/concat internal GPU ops

* add pad internal op

* formatting + no_cpu fix
2024-06-04 09:24:26 -07:00
Awni Hannun
ea9090bbc4 Add view op (#1179)
* add view primitive

* nit

* fix view
2024-06-04 08:05:27 -07:00
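
A sketch of the new primitive, assuming NumPy-style byte reinterpretation via an mx.view(a, dtype) binding:

    import mlx.core as mx

    x = mx.array([1.0, 2.0], dtype=mx.float32)
    bits = mx.view(x, mx.uint32)  # same underlying bytes, read as uint32
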
nicolov
81def6ac76 Fix benchmark (#1175) 2024-06-04 07:50:46 -07:00
Angelos Katharopoulos
3de8ce3f3c In place all-reduce and forgiving init (#1178) 2024-06-03 16:47:47 -07:00
Alex Barron
4d485fca24 Add defines include (#1176)
Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-03 09:50:10 -07:00
Brian Keene
1865299a30 Metal shaders for memory efficient self attention on large sequences (#964)
* Metal shaders for efficient self attention on large sequences

Updated fast attention: GEMM-ified with Steel primitives
Uses flash attention 1 for scale correction

* more compiler silencing

* Address rebase issues

* Templatize kernel instantiation, revise cpu bindings

* Safer writes to output

* Permit batch size > 1

* Numerical fixes for sdpa self attention

* Re-enable test, remove unused variable

* add benchmarking script

* Disable sdpa prior to perf tuning, and simplify tests for per-patch CI
2024-06-03 09:16:19 -07:00
Dominik Schlösser
3576b547c5 Doc error for default for scale in SinusoidalPositionalEncoding (#1174) 2024-06-02 13:42:45 -07:00
Awni Hannun
079882495d version bump (#1172) 2024-05-31 12:29:12 -07:00
K Venkat Ramnan
ab977109db feat: Added dlpack device (#1165)
* feat: Added dlpack device

* feat: Added device_id to dlpack device

* feat: Added device_id to dlpack device

* doc: updated conversion docs

* doc: updated numpy.rst dlpack information

* doc: updated numpy.rst dlpack information

* Update docs/src/usage/numpy.rst

* Update docs/src/usage/numpy.rst

---------

Co-authored-by: Venkat Ramnan Kalyanakumar <venkatramnankalyanakumar@Venkats-MacBook-Air.local>
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-05-31 12:29:01 -07:00
Awni Hannun
fd1c08137b stable cumprod grad at 0 (#1167) 2024-05-31 12:28:42 -07:00
Jagrit Digani
76b6cece46 Fix multi-block sort stride management (#1169)
* Fix multi-block sort stride management

* Add seed to tests
2024-05-31 11:10:54 -07:00
Jagrit Digani
9f0df51f8d Fix matvec vector stride bug (#1168) 2024-05-29 12:18:28 -07:00
Awni Hannun
e7a2a3dcd1 Fix a couple bugs (#1161)
* fix jit reduce for RMS norm

* make strides a single buffer

* better eval error message

* fix compiling with inf and bf16

* fix cpu compile with bf16
2024-05-28 15:18:18 -07:00
Awni Hannun
a87ef5bfc1 fix broadcast bug in bitwise ops (#1157) 2024-05-24 11:44:40 -07:00
Awni Hannun
9f9cb7a2ef version bump (#1154) 2024-05-23 18:08:08 -07:00
Awni Hannun
7e26fd8032 Option to JIT steel gemm / conv (#1139) 2024-05-23 18:07:34 -07:00
Jagrit Digani
eab2685c67 Float mask update (#1152)
* Float mask update

* Update CPU impl
2024-05-23 17:20:44 -07:00
Angelos Katharopoulos
50dfb664db Comms (#1097)
* Start the communications branch using MPI
* Add ops and primitives
* Add python bindings for distributed
2024-05-23 17:04:02 -07:00
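
A minimal sketch of the new distributed bindings; the group methods and op name shown are assumptions:

    import mlx.core as mx

    # run under MPI, e.g.: mpirun -np 2 python script.py
    world = mx.distributed.init()
    x = mx.distributed.all_sum(mx.ones((4,)))  # element-wise sum across ranks
    print(world.rank(), x)
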
Awni Hannun
0189ab6ab6 More jitting (#1132)
* docs + circle min size build

* jit scan, arange, softmax

* add sort

* jit reductions

* remove print

* fix deps

* clean includes / nits
2024-05-23 16:23:44 -07:00
Rifur13
9401507336 Add groups to 2-D convolutions (#1129)
* Added groups to 2-D convolutions. Only implemented for **some** specializations.

Also fixed 1D grouped convs with different kernel strides and added more tests.

* fix channels condition
2024-05-22 20:01:44 -07:00
Awni Hannun
eb8321d863 list based indexing (#1150) 2024-05-22 15:52:05 -07:00
Abe Leininger
79ef49b2c2 add mx.trace (#1143) (#1147)
* working c++ trace implementation

* updated throw + added overloads

* added python binding for trace function

* pre-commit reformatting

* add trace to docs

* resolve comments

* remove to_stream call
2024-05-22 15:50:27 -07:00
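
A usage sketch, assuming NumPy-like defaults (main diagonal):

    import mlx.core as mx

    a = mx.arange(9).reshape(3, 3)
    t = mx.trace(a)  # 0 + 4 + 8 = 12
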
Awni Hannun
e110ca11e2 Fix offset bug for device buffers (#1151)
* fix bug with large offsets for buffers

* add a test

* remove test as its too big for small machine
2024-05-22 15:50:05 -07:00
Awni Hannun
226748b3e7 JIT compile option for binary minimization (#1091)
* try cpp 20 for compile

* unary, binary, ternary in jit

* nits

* fix gather/scatter

* fix rebase

* reorg compile

* add ternary to compile

* jit copy

* jit compile flag

* fix build

* use linked function for ternary

* some nits

* docs + circle min size build

* docs + circle min size build

* fix extension

* fix no cpu build

* improve includes
2024-05-22 12:57:13 -07:00
Awni Hannun
d568c7ee36 Rename block sparse (#1149)
* block_sparse_mm to gather_mm

* rename

* nit

* nit
2024-05-22 07:48:34 -07:00
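
A sketch of the renamed op; the index arguments (selecting which batch entries of each operand to multiply) are assumptions:

    import mlx.core as mx

    a = mx.random.normal(shape=(4, 32, 64))
    b = mx.random.normal(shape=(4, 64, 16))
    lhs = mx.array([0, 2])  # hypothetical batch indices into a
    rhs = mx.array([1, 3])  # hypothetical batch indices into b
    out = mx.gather_mm(a, b, lhs, rhs)  # formerly mx.block_sparse_mm
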
Awni Hannun
e6fecbb3e1 Some fixes in docs (#1141)
* fixes in docs

* nit
2024-05-20 11:51:47 -07:00
Angelos Katharopoulos
da83f899bb Improve qvm speed (#1140) 2024-05-20 09:20:44 -07:00
jlwitthuhn
7e5674d8be Treat 'minimum' differently in cosine decay (#1138) 2024-05-20 08:00:48 -07:00
Shixian Sheng
0a558577bf Update README.md (#1136) 2024-05-20 06:16:40 -07:00
Awni Hannun
fb71a82ada Fix copy bug with many dims (#1137) 2024-05-17 21:10:03 -07:00
Awni Hannun
23406c9e9e Choose the right MLX bf16 for extensions (#1135)
* default to custom bf

* choose right bf

* fix extensions

* fix circle conf
2024-05-17 15:09:28 -07:00
Luca Arnaboldi
b3ec792380 Implemented Cholesky on CPU (#1119) 2024-05-17 12:31:59 -07:00
Awni Hannun
6a9b584f3d patch bump (#1131) 2024-05-16 20:51:33 -07:00
Awni Hannun
81dd33af66 allow conversion to dlpack (#1120) 2024-05-16 16:11:37 -07:00
Awni Hannun
8b76571896 Fix extensions (#1126)
* fix extensions

* title

* enable circle

* fix nanobind tag

* fix bug in doc

* try to fix config

* typo
2024-05-16 15:36:25 -07:00
Angelos Katharopoulos
e78a6518fa Block sparse qmm (#1124) 2024-05-16 15:24:14 -07:00
Awni Hannun
1873ffda01 Detect metal version and propagate correctly for JIT (#1109)
* detect metal version and propagate correctly for JIT

* remove softmax

* fix versions
2024-05-15 17:42:09 -07:00
Jacket
c417e42116 [Fix] minor typo in default argument for argpartition's "axis" parameter (#1125)
According to the documentation, argpartition's axis parameter can be None, but due to a previous typo it couldn't actually accept a None value.
2024-05-15 15:25:25 -07:00
Jagrit Digani
358e1fd6ab Fused GEMM (#1123)
* Basic gemm working

* Update addmm

* Clear out steel_gemm and steel_addmm kernels

* Fuse and clear out gather gemm

* Update objc releases
2024-05-15 10:30:41 -07:00
Awni Hannun
631dfbe673 fix scatter index bug (#1122) 2024-05-14 15:04:58 -07:00
Cheng
56a4eaed72 Pass missing stream arg in array.flatten (#1111) 2024-05-14 06:50:16 -07:00
Cheng
bf925d9dc7 Move args in conv_general (#1118)
Also fix a typo that padding_lo is passed as padding_hi.
2024-05-14 06:50:09 -07:00
Cheng
1a7ed5dcb6 Fill vector with constructor instead of fill_n (#1113) 2024-05-14 06:28:55 -07:00
Cheng
5be5daa6ef Use compiled function in Sigmoid module (#1116) 2024-05-14 06:25:57 -07:00
Cheng
60cb11764e Use correct module type in quantized.py (#1115) 2024-05-14 06:25:42 -07:00
Cheng
cbd5445ea7 The tile op does not accept None as reps (#1117) 2024-05-14 06:25:25 -07:00
Cheng
2c7e9b5158 Add missing docs for some ops (#1110) 2024-05-14 06:09:05 -07:00
Mike Drob
2263e4b279 Experiment with medium machines for CI (#1000) 2024-05-13 19:40:19 -07:00
Awni Hannun
863039da4c Allow scatter type exception to be caught by checking in op (#1077)
* allow exception to be caught in main thread

* only for gpu

* more detailed scatter error
2024-05-13 17:43:53 -07:00
Awni Hannun
7178ac0111 No CPU option for binary minimization (#1105)
* no cpu build option

* docs

* fix
2024-05-13 16:08:11 -07:00
Ravindra R. Jaju
e7f9710499 Fix typo in a variable name in example code. (#1104)
* Fix typo in a variable name in example code.

* Rename df2dx2 to d2fdx2 - the appropriate naming for the second derivative

* Update CONTRIBUTING.md - add needed python packages, and a virtual-env hint

* Revert "Fix typo in a variable name in example code."

This reverts commit bc10a17534.

* Rename df2dx2 to d2fdx2
2024-05-13 06:04:23 -07:00
Max-Heinrich Laves
ff4223904d Conv3d (#993)
* added conv3d

added conv3d

implemented explicit_gemm_conv_ND_cpu and bounds checks for slow_conv_3D

* incorporated reviewer comments

* fixed test

* reduced tensor shapes in test for conv3d

* Reviewer suggestion

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

Reviewer suggestion

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

Reviewer suggestion

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

Reviewer suggestion
2024-05-11 06:15:02 -07:00
Awni Hannun
a9f80d60f6 improve error messaging in eval (#1101) 2024-05-10 10:04:07 -07:00
Alex Barron
2e158cf6d0 Add conjugate operator (#1100)
* cpu and gpu impl

* add mx.conj and array.conj()

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-05-10 07:22:20 -07:00
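
Both spellings below are taken from the commit message itself:

    import mlx.core as mx

    z = mx.array([1 + 2j, 3 - 4j])
    print(mx.conj(z), z.conj())  # element-wise complex conjugate
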
Awni Hannun
8bd6bfa4b5 version (#1099) 2024-05-09 17:52:39 -07:00
Awni Hannun
8b1906abd0 Add compiler flags to disable safetensors and gguf (#1098)
* with docs

* nit
2024-05-09 17:39:44 -07:00
Awni Hannun
06375e6605 Split encoders in non-concurrent context with a max ops per encoder (#1085)
* split encoders

* fix race
2024-05-09 16:21:02 -07:00
Awni Hannun
b21242faf1 Allow unary ops to accept array like (#1093) 2024-05-09 09:36:02 -07:00
Rahul Yedida
cc05a281c4 Added ArcTan2 operation (#1079)
* Added ArcTan2 operation

* Cleanup, bug fixes from code review

* Minor cleanup, fixed Linux tests
2024-05-08 08:35:15 -07:00
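
A usage sketch, assuming the binding is mx.arctan2 with NumPy's (y, x) argument order:

    import mlx.core as mx

    y = mx.array([1.0, -1.0])
    x = mx.array([1.0, 1.0])
    angles = mx.arctan2(y, x)  # angle of the point (x, y), in radians
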
Jagrit Digani
fe96ceee66 Update block offset adjustment to be in size_t (#1087) 2024-05-08 08:10:23 -07:00
Awni Hannun
9814a2ae12 fix conversion to array (#1070) 2024-05-06 16:02:49 -07:00
Shubham
6992498e7a add keyword positional (#1081) 2024-05-06 07:18:49 -07:00
Awni Hannun
21623156a3 Reset peak memory (#1074)
* reset peak memory

* fix linux

* nits in docs
2024-05-03 17:12:51 -07:00
Nripesh Niketan
79c859e2e0 feat: implement clip_grad_norm (#1043)
* feat: implement `clip_grad_norm`

* pre-commit

* Add test for clip_grad_norm function in test_optimizers.py

* small fixes

* fix

* lint

* Update tree_reduce

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Refactor clip_grad_norm function to include documentation and improve readability

* format docstring

* Add acknowlegements

* text wrap

* pre-commit

* nits in docs

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-05-03 09:07:02 -07:00
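
A sketch; the module location and the (clipped_grads, total_norm) return convention are assumptions:

    import mlx.core as mx
    import mlx.optimizers as optim

    grads = {"w": mx.ones((4, 4)), "b": mx.ones((4,))}
    clipped, total_norm = optim.clip_grad_norm(grads, max_norm=1.0)
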
Awni Hannun
b00ac960b4 change initial memory limits and add memory size to device info (#1064) 2024-05-03 06:50:15 -07:00
Awni Hannun
02a9fc7bfa Patch bump (#1067)
* version

* use 0.12.2
2024-05-02 16:37:31 -07:00
Jagrit Digani
f390957685 Block sparse mm (#1058) 2024-05-02 14:03:58 -07:00
Angelos Katharopoulos
17f57df797 Improvements in the quantizer and dequantization kernel (#1061) 2024-05-01 18:19:11 -07:00
Awni Hannun
7f7b9662ea Fix leak for multi-output primitives which are never detached (#1059)
* fix multi output leak

* ignore arrays that will be detached

* add some comments

* stray print
2024-05-01 07:31:45 -07:00
Awni Hannun
19bef39f5c Add a mx.metal.device_info (#1060)
* device info

* add variant

* fix linux

* fix doc
2024-04-30 15:47:27 -07:00
Nripesh Niketan
a30e7ed2da feat: metal formatting and pre-commit bump (#1038)
* feat: metal formatting and pre-commit bump

* add guards

* update

* more guards

* more guards

* small fix

* Refactor instantiation of ternary types in ternary.metal

* fix scan.metal
2024-04-30 07:18:09 -07:00
Angelos Katharopoulos
8db7161c94 Bug fix in quantize (#1054) 2024-04-29 20:55:04 -07:00
Awni Hannun
09f1777896 fix slice update indexing (#1053) 2024-04-29 12:17:40 -07:00
Jacket
490c0c4fdc [Fix] expand axes for dimension with integer indices in mlx_slice_update (#1035)
* Not sure if this is correct

* Format

* Edit tests

* Add negative test

* Format

* add one more test

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-04-29 07:57:28 -07:00
Rifur13
c4a471c99d Add groups to Conv1d (#948)
* Add conv1d grouped convs on CPU

* Add GPU support

* Parallelize inside metal kernel

* cleanup

* Update mlx/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* New unfold kernel + remove unused code

* Remove copy and refactor

* Update vjp and reuse steel gemm

* Fixed groups on cpu

* Fix metal validation

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-04-27 06:24:57 -07:00
Awni Hannun
86f495985b Add bitwise ops (#1037)
* bitwise ops

* fix tests
2024-04-26 22:03:42 -07:00
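
A sketch of the new ops through the usual operator overloads (the exact set added in this commit is assumed):

    import mlx.core as mx

    a = mx.array([0b1100, 0b1010])
    b = mx.array([0b1010, 0b0110])
    print(a & b, a | b, a ^ b, a << 1)
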
Awni Hannun
67d1894759 fix order device -> scheduler (#1039) 2024-04-26 13:46:41 -07:00
Awni Hannun
5bfe89bdb1 Cpp docs (#1036)
* start of C++ docs

* fix stream doc

* only include ops for now
2024-04-26 12:56:05 -07:00
Angelos Katharopoulos
82463e9938 Bump the version to 0.12 (#1034) 2024-04-25 14:18:08 -07:00
Awni Hannun
771575d27b Expose function to clear memory cache (#1032)
* expose function to clear memory cache

* fix linux build

* fix metal tests
2024-04-24 16:48:51 -07:00
Angelos Katharopoulos
20a01bbd9f Simplifying and improving qmm (#1030) 2024-04-24 13:07:45 -07:00
Angelos Katharopoulos
ec8578d41a Fix quantization of all 0s (#1028) 2024-04-24 00:40:42 -07:00
Aneesh Shetty
d0dbfe0b97 Adds radians and degrees (#1011) 2024-04-22 11:17:49 -07:00
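
A usage sketch of the two new element-wise conversions:

    import mlx.core as mx

    deg = mx.array([0.0, 90.0, 180.0])
    rad = mx.radians(deg)   # degrees -> radians
    back = mx.degrees(rad)  # radians -> degrees
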
Awni Hannun
3d405fb3b1 Add synchronize function (#1006)
* add synchronize function

* fix linux

* fix linux

* fix and fix docs

* fix test

* try synchronize in stream destroy

* synchronize works for both cpu and gpu
2024-04-22 08:25:46 -07:00
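
A sketch pairing the new function with asynchronous evaluation (mx.async_eval is assumed to be available at this point in the history):

    import mlx.core as mx

    a = mx.random.normal(shape=(1024, 1024))
    b = a @ a
    mx.async_eval(b)  # queue the work without blocking
    mx.synchronize()  # block until the default stream has drained
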
Angelos Katharopoulos
b0012cdd0f Bump the patch version for the quants (#1018) 2024-04-19 20:28:34 -07:00
Angelos Katharopoulos
84d61d27aa Make sure 0 is represented in the quantization (#1016) 2024-04-19 19:47:26 -07:00
Awni Hannun
ed83908931 fix gguf loading quants (#1014)
* fix gguf loading quants

* fix nanobind install

* actual fix
2024-04-19 12:24:07 -07:00
Angelos Katharopoulos
ef5f7d1aea Fix buffer protocol buffer size designation (#1010) 2024-04-19 06:06:13 -07:00
Awni Hannun
090ff659dc bump (#1007) 2024-04-18 13:18:43 -07:00
Jagrit Digani
85c8a91a27 Fix mask broadcasting bug and add relevant test (#1003) 2024-04-17 17:33:48 -07:00
Piotr Rybiec
581b699ac9 avgpool, not maxpool (#1002) 2024-04-17 08:26:22 -07:00
Awni Hannun
8a0677d56d Shared events for synchronization + async eval (#998)
* more async eval

* fix rebase

* try correct async eval

* fix async

* more tests for async eval

* use shared events for synchronization

* comment + cleanup

* with autorelease pool

* fix no metal build

* fix compile

* fix patch

* don't eval if async eval'd

* don't use is_evaled

* comments

* more multi stream tests

* try and cleanup use of is_evaled

* use a status flag
2024-04-17 06:16:02 -07:00
Jagrit Digani
b18468bf81 Masked mm (#978)
* Add block masked matmul op and primitive
2024-04-16 14:45:39 -07:00
Shiyu
107ba2891a gelu tanh approx (#989)
* gelu tanh approx

* gelu tanh approx

* replace gelu approx with tanh approach

* fix comments

* fix comment
2024-04-15 19:49:00 -07:00
Awni Hannun
cd9e184529 Quantize embedding (#994)
* quantize embedding

* rename as_linear + comment

* consistency in docs

* fix test
2024-04-15 16:42:10 -07:00
Alex Barron
2e7c02d5cd Metal FFT for powers of 2 up to 2048 (#915)
* add Metal FFT for powers of 2

* skip GPU test on linux

* fix contiguity bug

* address comments

* Update mlx/backend/metal/fft.cpp

* Update mlx/backend/metal/fft.cpp

* fix bug in synch

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-04-11 21:40:06 -07:00
Awni Hannun
ae18326533 No copy command encoder (#986)
* no copy command encoder

* up layer norm test tolerances
2024-04-11 21:15:36 -07:00
Alex Shepard
91eba8e485 fix for grammatical typo in docs (#988)
thanks for mlx!
2024-04-11 17:02:06 -07:00
351 changed files with 34196 additions and 11925 deletions

.circleci/config.yml

@@ -31,7 +31,7 @@ jobs:
           name: Install dependencies
           command: |
             pip install --upgrade cmake
-            pip install git+https://github.com/wjakob/nanobind.git@4148debcf91f5ccab0c3b8d67b5c3cabd61f407f
+            pip install git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4
             pip install numpy
             sudo apt-get update
             sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
@@ -49,11 +49,6 @@ jobs:
           name: Run Python tests
           command: |
             python3 -m unittest discover python/tests -v
-      # TODO: Reenable when extension api becomes stable
-      # - run:
-      #     name: Build example extension
-      #     command: |
-      #       cd examples/extensions && python3 -m pip install .
       - run:
           name: Build CPP only
           command: |
@@ -69,18 +64,19 @@ jobs:
       default: "15.2.0"
     macos:
       xcode: << parameters.xcode_version >>
-    resource_class: macos.m1.large.gen1
+    resource_class: macos.m1.medium.gen1
     steps:
       - checkout
       - run:
           name: Install dependencies
           command: |
             brew install python@3.8
+            brew install openmpi
             python3.8 -m venv env
             source env/bin/activate
             pip install --upgrade pip
             pip install --upgrade cmake
-            pip install git+https://github.com/wjakob/nanobind.git@4148debcf91f5ccab0c3b8d67b5c3cabd61f407f
+            pip install git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4
             pip install numpy
             pip install torch
             pip install tensorflow
@@ -101,11 +97,14 @@ jobs:
             source env/bin/activate
             LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
             LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
-      # TODO: Reenable when extension api becomes stable
-      # - run:
-      #     name: Build example extension
-      #     command: |
-      #       cd examples/extensions && python3.11 -m pip install .
+            mpirun -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
+      - run:
+          name: Build example extension
+          command: |
+            source env/bin/activate
+            cd examples/extensions
+            pip install -r requirements.txt
+            python setup.py build_ext -j8
       - store_test_results:
           path: test-results
       - run:
@@ -117,7 +116,13 @@ jobs:
           name: Run CPP tests
           command: |
             DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
             DEVICE=cpu ./build/tests/tests
+      - run:
+          name: Build small binary
+          command: |
+            source env/bin/activate
+            cd build/
+            cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel -DBUILD_SHARED_LIBS=ON -DMLX_BUILD_CPU=OFF -DMLX_BUILD_SAFETENSORS=OFF -DMLX_BUILD_GGUF=OFF -DMLX_METAL_JIT=ON
+            make -j

   build_release:
     parameters:
@@ -132,18 +137,19 @@ jobs:
       default: ""
     macos:
       xcode: << parameters.xcode_version >>
-    resource_class: macos.m1.large.gen1
+    resource_class: macos.m1.medium.gen1
     steps:
       - checkout
       - run:
           name: Install dependencies
           command: |
             brew install python@<< parameters.python_version >>
+            brew install openmpi
             python<< parameters.python_version >> -m venv env
             source env/bin/activate
             pip install --upgrade pip
             pip install --upgrade cmake
-            pip install git+https://github.com/wjakob/nanobind.git@4148debcf91f5ccab0c3b8d67b5c3cabd61f407f
+            pip install git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4
             pip install --upgrade setuptools
             pip install numpy
             pip install twine
@@ -207,7 +213,7 @@ jobs:
             source env/bin/activate
             pip install --upgrade pip
             pip install --upgrade cmake
-            pip install git+https://github.com/wjakob/nanobind.git@4148debcf91f5ccab0c3b8d67b5c3cabd61f407f
+            pip install git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4
             pip install --upgrade setuptools
             pip install numpy
             pip install auditwheel

.pre-commit-config.yaml

@@ -1,11 +1,11 @@
 repos:
   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v18.1.3
+    rev: v18.1.4
     hooks:
       - id: clang-format
   # Using this mirror lets us use mypyc-compiled black, which is about 2x faster
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.3.0
+    rev: 24.4.2
     hooks:
       - id: black
   - repo: https://github.com/pycqa/isort

ACKNOWLEDGMENTS.md

@@ -7,15 +7,17 @@ with a short description of your contribution(s) below. For example:

 MLX was developed with contributions from the following individuals:

-- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops.
+- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`.
 - Juarez Bochi: Fixed bug in cross attention.
 - Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
-- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream` and safetensor support.
+- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream`, safetensors support, `einsum`, and `einsum_path`.
 - Gabrijel Boduljak: Added `mlx.core.linalg`, implemented `norm` method and `InstanceNorm` layer. Implemented pooling layers and ``Upsample``.
 - Hinrik Snær Guðmundsson: Added `atleast_1d`, `atleast_2d`, `atleast_3d` ops.
 - Luca Arnaboldi: Added `Ceil` and `Floor` ops; implemented pickling, copy and deepcopy for mlx arrays.
 - Brian Keene & Atila Orhon, with Argmax Inc.: Added `fast.scaled_dot_product_attention`
 - AmirHossein Razlighi: Added chaining support for some of the ops in `nn.Module`. Comparison works for non array objects in `mlx.core.array`. Exception handling for invalid operations in `mlx.core.array`.
+- Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
+- Paul Paczuski: Improved stability of BCE loss calculation

 <a href="https://github.com/ml-explore/mlx/graphs/contributors">
   <img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />

CMakeLists.txt

@@ -15,12 +15,16 @@ option(MLX_BUILD_EXAMPLES "Build examples for mlx" ON)
 option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
 option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
 option(MLX_BUILD_METAL "Build metal backend" ON)
+option(MLX_BUILD_CPU "Build cpu backend" ON)
 option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
 option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
+option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
+option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
+option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
 option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)

 if(NOT MLX_VERSION)
-  set(MLX_VERSION 0.10.0)
+  set(MLX_VERSION 0.16.2)
 endif()

 # --------------------- Processor tests -------------------------
@@ -79,15 +83,17 @@ elseif (MLX_BUILD_METAL)
     OUTPUT_VARIABLE MACOS_VERSION
     COMMAND_ERROR_IS_FATAL ANY)

-  message(STATUS "Building with SDK for macOS version ${MACOS_VERSION}")
-  if (${MACOS_VERSION} GREATER_EQUAL 14.2)
-    set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14.2_iOS17.2.zip)
-  elseif (${MACOS_VERSION} GREATER_EQUAL 14.0)
-    set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14_iOS17-beta.zip)
-  else()
+  if (${MACOS_VERSION} LESS 14.0)
     message(FATAL_ERROR "MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON" )
   endif()
+  message(STATUS "Building with SDK for macOS version ${MACOS_VERSION}")
+  set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18-beta.zip)
+  # Get the metal version
+  execute_process(
+    COMMAND zsh "-c" "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal -E -x metal -P - | tail -1 | tr -d '\n'"
+    OUTPUT_VARIABLE MLX_METAL_VERSION
+    COMMAND_ERROR_IS_FATAL ANY)

   FetchContent_Declare(
     metal_cpp
@@ -101,17 +107,20 @@ elseif (MLX_BUILD_METAL)
       $<INSTALL_INTERFACE:include/metal_cpp>
   )
   target_link_libraries(
-    mlx
+    mlx PUBLIC
     ${METAL_LIB}
     ${FOUNDATION_LIB}
     ${QUARTZ_LIB})
+  add_compile_definitions("MLX_METAL_VERSION=${MLX_METAL_VERSION}")
 endif()

+if (MLX_BUILD_CPU)
 find_library(ACCELERATE_LIBRARY Accelerate)
 if (MLX_BUILD_ARM AND ACCELERATE_LIBRARY)
   message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
   set(MLX_BUILD_ACCELERATE ON)
-  target_link_libraries(mlx ${ACCELERATE_LIBRARY})
+  target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
   add_compile_definitions(ACCELERATE_NEW_LAPACK)
 else()
   message(STATUS "Accelerate or arm neon not found, using default backend.")
@@ -134,7 +143,7 @@ else()
   message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
   message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
   target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
-  target_link_libraries(mlx ${LAPACK_LIBRARIES})
+  target_link_libraries(mlx PUBLIC ${LAPACK_LIBRARIES})
   # List blas after lapack otherwise we may accidentally incldue an old version
   # of lapack.h from the include dirs of blas.
   find_package(BLAS REQUIRED)
@@ -149,7 +158,34 @@ else()
   message(STATUS "Blas lib " ${BLAS_LIBRARIES})
   message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
   target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
-  target_link_libraries(mlx ${BLAS_LIBRARIES})
+  target_link_libraries(mlx PUBLIC ${BLAS_LIBRARIES})
+  endif()
+else()
+  set(MLX_BUILD_ACCELERATE OFF)
+endif()
+
+find_package(MPI)
+if (MPI_FOUND)
+  execute_process(
+    COMMAND zsh "-c" "mpirun --version"
+    OUTPUT_VARIABLE MPI_VERSION
+    ERROR_QUIET
+  )
+  if (${MPI_VERSION} MATCHES ".*Open MPI.*")
+    target_include_directories(mlx PRIVATE ${MPI_INCLUDE_PATH})
+  elseif (MPI_VERSION STREQUAL "")
+    set(MPI_FOUND FALSE)
+    message(
+      WARNING
+      "MPI found but mpirun is not available. Building without MPI."
+    )
+  else()
+    set(MPI_FOUND FALSE)
+    message(
+      WARNING
+      "MPI which is not OpenMPI found. Building without MPI."
+    )
+  endif()
 endif()

 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)
@@ -161,6 +197,14 @@ target_include_directories(
   $<INSTALL_INTERFACE:include>
 )

+FetchContent_Declare(fmt
+  GIT_REPOSITORY https://github.com/fmtlib/fmt.git
+  GIT_TAG 10.2.1
+  EXCLUDE_FROM_ALL
+)
+FetchContent_MakeAvailable(fmt)
+target_link_libraries(mlx PRIVATE fmt::fmt-header-only)
+
 if (MLX_BUILD_PYTHON_BINDINGS)
   message(STATUS "Building Python bindings.")
   find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)

README.md

@@ -88,13 +88,13 @@ for more information on building the C++ and Python APIs from source.

 ## Contributing

-Check out the [contribution guidelines](CONTRIBUTING.md) for more information
+Check out the [contribution guidelines](https://github.com/ml-explore/mlx/tree/main/CONTRIBUTING.md) for more information
 on contributing to MLX. See the
 [docs](https://ml-explore.github.io/mlx/build/html/install.html) for more
 information on building from source, and running tests.

 We are grateful for all of [our
-contributors](ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
+contributors](https://github.com/ml-explore/mlx/tree/main/ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
 to MLX and wish to be acknowledged, please add your name to the list in your
 pull request.

@@ -185,7 +185,7 @@ def prelu(x: torch.Tensor) -> torch.Tensor:
 def mish(x: torch.Tensor) -> torch.Tensor:
     y = x
     for _ in range(100):
-        return torch.nn.functional.mish(y)
+        y = torch.nn.functional.mish(y)
     sync_if_needed(x)

@@ -283,6 +283,14 @@ def topk(axis, x):
     sync_if_needed(x)

+@torch.no_grad()
+def step_function(x):
+    y = x
+    for i in range(100):
+        y = torch.where(y < 0, 0, 1)
+    sync_if_needed(x)
+
 @torch.no_grad()
 def selu(x):
     y = x

@@ -446,5 +454,11 @@ if __name__ == "__main__":
     elif args.benchmark == "topk":
         print(bench(topk, axis, x))
+    elif args.benchmark == "step":
+        print(bench(step_function, x))
+    elif args.benchmark == "selu":
+        print(bench(selu, x))
     else:
-        raise ValueError("Unknown benchmark")
+        raise ValueError(f"Unknown benchmark `{args.benchmark}`.")

@@ -16,7 +16,9 @@ def run_or_raise(*args, **kwargs):
         result = run(*args, capture_output=True, **kwargs)
         return float(result.stdout)
     except ValueError:
-        raise ValueError(f"stdout: {result.stdout}\nstderr: {result.stderr}")
+        raise ValueError(
+            f"stdout: {result.stdout.decode()}\nstderr: {result.stderr.decode()}"
+        )

 def compare(args):

@@ -9,7 +9,6 @@ from time_utils import time_fn

 def bench_gelu():
     def gelu(x):
         return x * (1 + mx.erf(x / math.sqrt(2))) / 2
-

@@ -51,7 +50,6 @@ def bench_gelu():

 def bench_layernorm():
     weight = mx.random.uniform(shape=(4096,)).astype(mx.float16)
     bias = mx.random.uniform(shape=(4096,)).astype(mx.float16)
-
     mx.eval(weight, bias)


@@ -0,0 +1,123 @@
import argparse
import math
import os
import subprocess
import time
import mlx.core as mx
import numpy as np
import torch
device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
device_name = device_name.decode("utf-8").strip("\n")
N_warmup = 10
N_iter_bench = 100
N_iter_func = 5
def bench(f, a, b):
for i in range(N_warmup):
f(a, b)
torch.mps.synchronize()
s = time.perf_counter_ns()
for i in range(N_iter_bench):
f(a, b)
e = time.perf_counter_ns()
return (e - s) * 1e-9
def make_mx_conv_1D(strides=1, padding=0, groups=1):
def mx_conv_1D(a, b):
ys = []
for _ in range(N_iter_func):
y = mx.conv1d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
mx.eval(ys)
return ys
return mx_conv_1D
def make_pt_conv_1D(strides=1, padding=0, groups=1):
@torch.no_grad()
def pt_conv_1D(a, b):
ys = []
for _ in range(N_iter_func):
y = torch.conv1d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
torch.mps.synchronize()
return ys
return pt_conv_1D
def bench_shape(N, iH, C, wH, O, strides, padding, np_dtype, groups):
scale = 1.0 / math.sqrt(wH * C)
a_np = np.random.uniform(0, 0.5, (N, iH, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, wH, int(C / groups))).astype(np_dtype)
a_mx = mx.array(a_np)
b_mx = mx.array(b_np)
a_pt = torch.from_numpy(a_np.transpose((0, 2, 1))).to("mps")
b_pt = torch.from_numpy(b_np.transpose((0, 2, 1))).to("mps")
torch.mps.synchronize()
f_mx = make_mx_conv_1D(strides, padding, groups)
f_pt = make_pt_conv_1D(strides, padding, groups)
time_torch = bench(f_pt, a_pt, b_pt)
time_mlx = bench(f_mx, a_mx, b_mx)
out_mx = mx.conv1d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
out_pt = torch.conv1d(
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
)
out_pt = torch.permute(out_pt, (0, 2, 1))
out_pt = out_pt.numpy(force=True)
atol = 2e-5 if np_dtype == np.float32 else 1e-4
if not np.allclose(out_pt, out_mx, atol=atol):
print(
f"Failed at {(N, iH, C)}, {(O, wH, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
)
return time_mlx, time_torch
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run conv benchmarks")
dtypes = ("float32",)
shapes = (
(4, 32, 32, 5, 32, 1, 2, 1),
(4, 32, 32, 5, 32, 1, 2, 2),
(4, 32, 32, 5, 32, 1, 2, 4),
(4, 32, 32, 5, 32, 1, 2, 8),
(4, 32, 32, 5, 32, 1, 2, 8),
(4, 32, 32, 5, 32, 1, 2, 16),
(4, 32, 32, 5, 32, 1, 2, 32),
(4, 32, 256, 5, 512, 1, 2, 2),
(4, 32, 256, 5, 512, 1, 2, 128),
(4, 32, 256, 5, 512, 1, 2, 256),
)
for dtype in dtypes:
print("(N, iH, C), (O, wH, C), dtype, stride, pads, groups, diff%")
for N, iH, C, wH, O, strides, padding, groups in shapes:
np_dtype = getattr(np, dtype)
time_mlx, time_torch = bench_shape(
N, iH, C, wH, O, strides, padding, np_dtype, groups
)
diff = time_torch / time_mlx - 1.0
print(
f"({N}, {iH:3d}, {C:3d}), ({O:3d}, {wH:2d}, {C:3d}), {dtype}, {strides:5d}, {padding:4d}, {groups:6d}, {100. * diff:+5.2f}%"
)
if time_mlx >= 2.0 * time_torch:
print("ATTENTION ^^^^^^^")

@@ -28,11 +28,11 @@ def bench(f, a, b):
     return (e - s) * 1e-9

-def make_mx_conv_2D(strides=(1, 1), padding=(0, 0)):
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
     def mx_conv_2D(a, b):
         ys = []
         for i in range(N_iter_func):
-            y = mx.conv2d(a, b, stride=strides, padding=padding)
+            y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
             ys.append(y)
         mx.eval(ys)
         return ys
@@ -40,12 +40,12 @@ def make_mx_conv_2D(strides=(1, 1), padding=(0, 0)):
     return mx_conv_2D

-def make_pt_conv_2D(strides=(1, 1), padding=(0, 0)):
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
     @torch.no_grad()
     def pt_conv_2D(a, b):
         ys = []
         for i in range(N_iter_func):
-            y = torch.conv2d(a, b, stride=strides, padding=padding)
+            y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
             ys.append(y)
         torch.mps.synchronize()
         return ys
@@ -53,11 +53,12 @@ def make_pt_conv_2D(strides=(1, 1), padding=(0, 0)):
     return pt_conv_2D

-def bench_shape(N, H, W, C, kH, kW, O, strides, padding, np_dtype):
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
     scale = 1.0 / math.sqrt(kH * kH * C)
     a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
-    b_np = np.random.uniform(-scale, scale, (O, kH, kW, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )

     a_mx = mx.array(a_np)
     b_mx = mx.array(b_np)
@@ -67,15 +68,15 @@ def bench_shape(N, H, W, C, kH, kW, O, strides, padding, np_dtype):
     torch.mps.synchronize()

-    f_mx = make_mx_conv_2D(strides, padding)
-    f_pt = make_pt_conv_2D(strides, padding)
+    f_mx = make_mx_conv_2D(strides, padding, groups)
+    f_pt = make_pt_conv_2D(strides, padding, groups)

     time_torch = bench(f_pt, a_pt, b_pt)
     time_mlx = bench(f_mx, a_mx, b_mx)

-    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding)
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
     out_pt = torch.conv2d(
-        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
     )
     out_pt = torch.permute(out_pt, (0, 2, 3, 1))
     out_pt = out_pt.numpy(force=True)
@@ -84,7 +85,7 @@ def bench_shape(N, H, W, C, kH, kW, O, strides, padding, np_dtype):
     if not np.allclose(out_pt, out_mx, atol=atol):
         print(
-            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
         )

     return time_mlx, time_torch
@@ -95,35 +96,40 @@ if __name__ == "__main__":
     dtypes = ("float32",)
     shapes = (
-        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2)),
-        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2)),
-        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2)),
-        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2)),
-        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2)),
-        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2)),
-        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2)),
-        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2)),
-        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2)),
-        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2)),
-        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2)),
-        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2)),
-        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2)),
-        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2)),
-        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2)),
-        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2)),
+        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
+        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
+        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
     )

     for dtype in dtypes:
-        print("(N, H, W, C), ( O, kH, kW, C), dtype, stride, pads, diff%")
-        for N, H, W, C, kH, kW, O, strides, padding in shapes:
+        print(
+            "(N, H, W, C), ( O, kH, kW, C), dtype, stride, pads, groups, diff%"
+        )
+        for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
             np_dtype = getattr(np, dtype)
             time_mlx, time_torch = bench_shape(
-                N, H, W, C, kH, kW, O, strides, padding, np_dtype
+                N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
             )
             diff = time_torch / time_mlx - 1.0
             print(
-                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {100. * diff:+5.2f}%"
+                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
             )
             if time_mlx >= 2.0 * time_torch:
                 print("ATTENTION ^^^^^^^")


@@ -0,0 +1,84 @@
# Copyright © 2024 Apple Inc.
import time
import mlx.core as mx
import numpy as np
def timeit(fn, its=100, args=[]):
for _ in range(5):
fn(*args)
tic = time.perf_counter()
for _ in range(its):
fn(*args)
toc = time.perf_counter()
return 1e3 * (toc - tic) / its
def time_little_einsum_path():
subscripts = "ik,kj->ij"
x = mx.ones((32, 32))
y = mx.ones((32, 32))
mx_time = timeit(mx.einsum_path, args=(subscripts, x, y))
x = np.array(x)
y = np.array(y)
np_time = timeit(np.einsum_path, args=(subscripts, x, y))
print("Timing little einsum path...")
print(f"MLX ... {mx_time:.3f} ms")
print(f"NumPy... {np_time:.3f} ms")
def time_big_einsum_path():
chars = list("abcdefgh")
char_to_dim = {c: v for v, c in enumerate(chars)}
num_inputs = 10
inputs = []
subscripts = []
for _ in range(num_inputs):
subscript = np.random.choice(chars, size=5, replace=False).tolist()
subscripts.append("".join(subscript))
inputs.append(np.ones(list(char_to_dim[c] for c in subscript)))
subscripts = ",".join(subscripts)
np_time = timeit(np.einsum_path, args=(subscripts, *inputs))
inputs = [mx.array(x) for x in inputs]
mx_time = timeit(mx.einsum_path, args=(subscripts, *inputs))
print("Timing big einsum path...")
print(f"MLX ... {mx_time:.3f} ms")
print(f"NumPy... {np_time:.3f} ms")
def time_attention():
def regular_attention(x):
# shape [batch, sequence, num_heads, head_dim]
queries, keys, values = x, x, x
scores = queries.transpose(0, 2, 1, 3) @ keys.transpose(0, 2, 3, 1)
scores = mx.softmax(scores, axis=-1)
output = (scores @ values.transpose(0, 2, 1, 3)).swapaxes(1, 2)
mx.eval(output)
def einsum_attention(x):
# shape [batch, sequence, num_heads, head_dim]
queries, keys, values = x, x, x
scores = mx.einsum("itjk,iujk->ijtu", queries, keys)
scores = mx.softmax(scores, axis=-1)
output = mx.einsum("ijtu,iujk->itjk", scores, values)
mx.eval(output)
x = mx.random.uniform(shape=(8, 512, 32, 128))
regular_time = timeit(regular_attention, args=(x,))
ein_time = timeit(einsum_attention, args=(x,))
print("Timing einsum attention...")
print(f"Regular ... {regular_time:.3f} ms")
print(f"Einsum ... {ein_time:.3f} ms")
if __name__ == "__main__":
time_little_einsum_path()
time_big_einsum_path()
time_attention()


@@ -0,0 +1,118 @@
# Copyright © 2024 Apple Inc.

import matplotlib
import mlx.core as mx
import numpy as np
import sympy
import torch
from time_utils import measure_runtime

matplotlib.use("Agg")
import matplotlib.pyplot as plt


def bandwidth_gb(runtime_ms, system_size):
    bytes_per_fft = np.dtype(np.complex64).itemsize * 2
    bytes_per_gb = 1e9
    ms_per_s = 1e3
    return system_size * bytes_per_fft / runtime_ms * ms_per_s / bytes_per_gb


def run_bench(system_size, fft_sizes, backend="mlx", dim=1):
    def fft_mlx(x):
        if dim == 1:
            out = mx.fft.fft(x)
        elif dim == 2:
            out = mx.fft.fft2(x)
        mx.eval(out)
        return out

    def fft_mps(x):
        if dim == 1:
            out = torch.fft.fft(x)
        elif dim == 2:
            out = torch.fft.fft2(x)
        torch.mps.synchronize()
        return out

    bandwidths = []
    for n in fft_sizes:
        batch_size = system_size // n**dim
        shape = [batch_size] + [n for _ in range(dim)]
        if backend == "mlx":
            x_np = np.random.uniform(size=(system_size // n, n)).astype(np.complex64)
            x = mx.array(x_np)
            mx.eval(x)
            fft = fft_mlx
        elif backend == "mps":
            x_np = np.random.uniform(size=(system_size // n, n)).astype(np.complex64)
            x = torch.tensor(x_np, device="mps")
            torch.mps.synchronize()
            fft = fft_mps
        else:
            raise NotImplementedError()
        runtime_ms = measure_runtime(fft, x=x)
        bandwidth = bandwidth_gb(runtime_ms, np.prod(shape))
        print(n, bandwidth)
        bandwidths.append(bandwidth)

    return np.array(bandwidths)


def time_fft():
    x = np.array(range(2, 512))
    system_size = int(2**26)

    print("MLX GPU")
    with mx.stream(mx.gpu):
        gpu_bandwidths = run_bench(system_size=system_size, fft_sizes=x)

    print("MPS GPU")
    mps_bandwidths = run_bench(system_size=system_size, fft_sizes=x, backend="mps")

    print("CPU")
    system_size = int(2**20)
    with mx.stream(mx.cpu):
        cpu_bandwidths = run_bench(system_size=system_size, fft_sizes=x)

    x = np.array(x)

    all_indices = x - x[0]
    radix_2to13 = (
        np.array([i for i in x if all(p <= 13 for p in sympy.primefactors(i))]) - x[0]
    )
    bluesteins = (
        np.array([i for i in x if any(p > 13 for p in sympy.primefactors(i))]) - x[0]
    )

    for indices, name in [
        (all_indices, "All"),
        (radix_2to13, "Radix 2-13"),
        (bluesteins, "Bluestein's"),
    ]:
        # plot bandwidths
        print(name)
        plt.scatter(x[indices], gpu_bandwidths[indices], color="green", label="GPU")
        plt.scatter(x[indices], mps_bandwidths[indices], color="blue", label="MPS")
        plt.scatter(x[indices], cpu_bandwidths[indices], color="red", label="CPU")
        plt.title(f"MLX FFT Benchmark -- {name}")
        plt.xlabel("N")
        plt.ylabel("Bandwidth (GB/s)")
        plt.legend()
        plt.savefig(f"{name}.png")
        plt.clf()

    av_gpu_bandwidth = np.mean(gpu_bandwidths)
    av_mps_bandwidth = np.mean(mps_bandwidths)
    av_cpu_bandwidth = np.mean(cpu_bandwidths)
    print("Average bandwidths:")
    print("GPU:", av_gpu_bandwidth)
    print("MPS:", av_mps_bandwidth)
    print("CPU:", av_cpu_bandwidth)

    portion_faster = len(np.where(gpu_bandwidths > mps_bandwidths)[0]) / len(x)
    print("Percent MLX faster than MPS: ", portion_faster * 100)


if __name__ == "__main__":
    time_fft()

View File

@@ -0,0 +1,70 @@
import argparse

import matplotlib
import mlx.core as mx
import numpy as np
from time_utils import measure_runtime

matplotlib.use("Agg")
import matplotlib.pyplot as plt


def had(x):
    y = mx.hadamard_transform(x)
    mx.eval(y)


def copy(x):
    y = x + 1.0
    mx.eval(y)


def run(dtype):
    system_size = 2**26
    outputs = {}
    for test_fn in (had, copy):
        for m in [1, 12, 20, 28]:
            if test_fn == copy:
                key = "copy"
            elif m == 1:
                key = "had_2^k"
            else:
                key = "had_m*2^k"
            outputs.setdefault(key, {})
            for k in range(7, 14):
                n = m * 2**k
                if n > 2**15:
                    continue
                x_np = np.random.normal(size=(system_size // n, n)).astype(dtype)
                x = mx.array(x_np)
                runtime_ms = measure_runtime(test_fn, x=x)
                bytes_per_gb = 1e9
                ms_per_s = 1e3
                bytes_per_had = np.dtype(x_np.dtype).itemsize * 2
                bandwidth_gb = (
                    system_size * bytes_per_had / runtime_ms * ms_per_s / bytes_per_gb
                )
                print(n, bandwidth_gb)
                outputs[key][n] = bandwidth_gb

    colors = {
        "copy": "black",
        "had_2^k": "steelblue",
        "had_m*2^k": "skyblue",
    }
    for key, output in outputs.items():
        plt.scatter(output.keys(), output.values(), color=colors[key], label=key)
    plt.title(f"MLX Hadamard Benchmark -- {dtype.__name__}")
    plt.xlabel("N")
    plt.ylabel("Bandwidth (GB/s)")
    plt.legend()
    plt.savefig(f"bench_{dtype.__name__}.png")
    plt.clf()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--fp16", action="store_true")
    args = parser.parse_args()
    dtype = np.float16 if args.fp16 else np.float32
    run(dtype)

View File

@@ -0,0 +1,62 @@
import argparse
import math

import mlx.core as mx
from time_utils import time_fn

MAX_SEQ = 300
START_SEQ = 100
SEQ_INCREMENT = 50


def time_self_attention_primitives():
    mx.random.seed(3)
    B = 2
    H = 38
    D = 64
    for R in range(START_SEQ, MAX_SEQ, SEQ_INCREMENT):
        q = mx.random.uniform(shape=(B, H, R, D))
        k = mx.random.uniform(shape=(B, H, R, D))
        v = mx.random.uniform(shape=(B, H, R, D))
        scale = 1.0 / math.sqrt(float(D))
        mx.eval(q, k, v)

        def sdpa_primitives(qs, ks, vs, alpha):
            s = (alpha * qs) @ ks.transpose(0, 1, 3, 2)
            p = mx.softmax(s.astype(mx.float32), axis=-1).astype(s.dtype)
            o = p @ vs
            return o

        time_fn(sdpa_primitives, q, k, v, scale)


def time_self_attention_sdpa():
    mx.random.seed(3)
    B = 2
    H = 38
    D = 64
    for R in range(START_SEQ, MAX_SEQ, SEQ_INCREMENT):
        q = mx.random.uniform(shape=(B, H, R, D))
        k = mx.random.uniform(shape=(B, H, R, D))
        v = mx.random.uniform(shape=(B, H, R, D))
        scale = 1.0 / math.sqrt(float(D))
        mx.eval(q, k, v)

        def sdpa_fused(qs, ks, vs, alpha):
            o = mx.fast.scaled_dot_product_attention(qs, ks, vs, scale=alpha)
            return o

        time_fn(sdpa_fused, q, k, v, scale)


if __name__ == "__main__":
    parser = argparse.ArgumentParser("MLX benchmarks.")
    parser.add_argument("--gpu", action="store_true", help="Use the Metal back-end.")
    args = parser.parse_args()
    if args.gpu:
        mx.set_default_device(mx.gpu)
    else:
        mx.set_default_device(mx.cpu)
    time_self_attention_sdpa()
    time_self_attention_primitives()

docs/Doxyfile Normal file
View File

@@ -0,0 +1,50 @@
################################################################################
# Primary project setup. #
################################################################################
PROJECT_NAME = "MLX"
OUTPUT_DIRECTORY = build
XML_OUTPUT = xml
HTML_OUTPUT = html
STRIP_FROM_PATH = ../
INPUT = ../mlx
FILE_PATTERNS = *.h
EXCLUDE_PATTERNS = */private/*
CREATE_SUBDIRS = NO
FULL_PATH_NAMES = YES
RECURSIVE = YES
GENERATE_HTML = YES
GENERATE_LATEX = NO
GENERATE_XML = YES
XML_PROGRAMLISTING = YES
################################################################################
# Doxygen preprocessor / parser control. #
################################################################################
ENABLE_PREPROCESSING = YES
MACRO_EXPANSION = YES
EXPAND_ONLY_PREDEF = NO
SKIP_FUNCTION_MACROS = NO
################################################################################
# Compound extraction control. #
################################################################################
EXTRACT_ALL = YES
EXTRACT_PACKAGE = YES
EXTRACT_STATIC = YES
CASE_SENSE_NAMES = NO
################################################################################
# Docstring control / customization. #
################################################################################
JAVADOC_AUTOBRIEF = YES
################################################################################
# Warning suppression. #
################################################################################
QUIET = YES
WARN_IF_UNDOCUMENTED = NO

View File

@@ -2,12 +2,16 @@
 ### Setup (do once)

-Install [sphinx](https://www.sphinx-doc.org/en/master/usage/installation.html)
-for example with `conda`:
+Install Doxygen:

 ```
-conda install sphinx
-pip install sphinx-book-theme
+brew install doxygen
+```
+
+Install Python packages:
+
+```
+pip install -r requirements.txt
 ```

 ### Build
@@ -15,7 +19,7 @@ pip install sphinx-book-theme
 Build the docs from `mlx/docs/`

 ```
-make html
+doxygen && make html
 ```

 View the docs by running a server in `mlx/docs/build/html/`:

docs/requirements.txt Normal file
View File

@@ -0,0 +1,4 @@
sphinx
breathe
sphinx-book-theme
mlx

View File

@@ -0,0 +1,20 @@
{{ fullname | escape | underline}}

.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}

   {% block methods %}

   {% if methods %}
   .. rubric:: {{ _('Methods') }}

   .. autosummary::
   {% for item in methods %}
      {%- if item not in inherited_members and item != "__init__" %}
      ~{{ name }}.{{ item }}
      {%- endif %}
   {%- endfor %}
   {% endif %}
   {% endblock %}

View File

@@ -22,6 +22,7 @@ extensions = [
"sphinx.ext.autosummary", "sphinx.ext.autosummary",
"sphinx.ext.intersphinx", "sphinx.ext.intersphinx",
"sphinx.ext.napoleon", "sphinx.ext.napoleon",
"breathe",
] ]
python_use_unqualified_type_names = True python_use_unqualified_type_names = True
@@ -33,6 +34,9 @@ intersphinx_mapping = {
"numpy": ("https://numpy.org/doc/stable/", None), "numpy": ("https://numpy.org/doc/stable/", None),
} }
breathe_projects = {"mlx": "../build/xml"}
breathe_default_project = "mlx"
templates_path = ["_templates"] templates_path = ["_templates"]
html_static_path = ["_static"] html_static_path = ["_static"]
source_suffix = ".rst" source_suffix = ".rst"
@@ -79,3 +83,15 @@ def setup(app):
# -- Options for LaTeX output ------------------------------------------------ # -- Options for LaTeX output ------------------------------------------------
latex_documents = [(main_doc, "MLX.tex", "MLX Documentation", author, "manual")] latex_documents = [(main_doc, "MLX.tex", "MLX Documentation", author, "manual")]
latex_elements = {
"preamble": r"""
\usepackage{enumitem}
\setlistdepth{5}
\setlist[itemize,1]{label=$\bullet$}
\setlist[itemize,2]{label=$\bullet$}
\setlist[itemize,3]{label=$\bullet$}
\setlist[itemize,4]{label=$\bullet$}
\setlist[itemize,5]{label=$\bullet$}
\renewlist{itemize}{itemize}{5}
""",
}

View File

@@ -3,4 +3,5 @@
 Operations
 ==========

+.. doxygengroup:: ops
+   :content-only:

View File

@@ -1,5 +1,5 @@
-Developer Documentation
-=======================
+Custom Extensions in MLX
+========================

 You can extend MLX with custom operations on the CPU or GPU. This guide
 explains how to do that with a simple example.
@@ -486,15 +486,14 @@ below.
   std::ostringstream kname;
   kname << "axpby_" << "general_" << type_to_name(out);

-  // Make sure the metal library is available and look for it
-  // in the same folder as this executable if needed
-  d.register_library("mlx_ext", metal::get_colocated_mtllib_path);
+  // Make sure the metal library is available
+  d.register_library("mlx_ext");

   // Make a kernel from this metal library
   auto kernel = d.get_kernel(kname.str(), "mlx_ext");

   // Prepare to encode kernel
-  auto compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = d.get_command_encoder(s.index);
   compute_encoder->setComputePipelineState(kernel);

   // Kernel parameters are registered with buffer indices corresponding to
@@ -503,11 +502,11 @@ below.
   size_t nelem = out.size();

   // Encode input arrays to kernel
-  set_array_buffer(compute_encoder, x, 0);
-  set_array_buffer(compute_encoder, y, 1);
+  compute_encoder.set_input_array(x, 0);
+  compute_encoder.set_input_array(y, 1);

   // Encode output arrays to kernel
-  set_array_buffer(compute_encoder, out, 2);
+  compute_encoder.set_output_array(out, 2);

   // Encode alpha and beta
   compute_encoder->setBytes(&alpha_, sizeof(float), 3);
@@ -531,7 +530,7 @@ below.
   // Launch the grid with the given number of threads divided among
   // the given threadgroups
-  compute_encoder->dispatchThreads(grid_dims, group_dims);
+  compute_encoder.dispatchThreads(grid_dims, group_dims);
 }

 We can now call the :meth:`axpby` operation on both the CPU and the GPU!
@@ -825,7 +824,7 @@ Let's look at a simple script and its results:
   print(f"c shape: {c.shape}")
   print(f"c dtype: {c.dtype}")
-  print(f"c correctness: {mx.all(c == 6.0).item()}")
+  print(f"c correct: {mx.all(c == 6.0).item()}")

 Output:

View File

@@ -32,10 +32,9 @@ work.
   trace_file = "mlx_trace.gputrace"

-  if not mx.metal.start_capture(trace_file):
-      print("Make sure to run with MTL_CAPTURE_ENABLED=1 and "
-            f"that the path {trace_file} does not already exist.")
-      exit(1)
+  # Make sure to run with MTL_CAPTURE_ENABLED=1 and
+  # that the path trace_file does not already exist.
+  mx.metal.start_capture(trace_file)

   for _ in range(10):
       mx.eval(mx.add(a, b))

View File

@@ -15,7 +15,7 @@ module to concisely define the model architecture.
 Attention layer
 ^^^^^^^^^^^^^^^^

-We will start with the llama attention layer which notably uses the RoPE
+We will start with the Llama attention layer which notably uses the RoPE
 positional encoding. [1]_ In addition, our attention layer will optionally use a
 key/value cache that will be concatenated with the provided keys and values to
 support efficient inference.

View File

@@ -64,7 +64,7 @@ set:
 Next, setup the problem parameters and load the data. To load the data, you need our
 `mnist data loader
 <https://github.com/ml-explore/mlx-examples/blob/main/mnist/mnist.py>`_, which
-we will import as `mnist`.
+we will import as ``mnist``.

 .. code-block:: python

View File

@@ -43,6 +43,7 @@ are the CPU and GPU.
    usage/function_transforms
    usage/compile
    usage/numpy
+   usage/distributed
    usage/using_streams

 .. toctree::
@@ -69,6 +70,7 @@ are the CPU and GPU.
    python/metal
    python/nn
    python/optimizers
+   python/distributed
    python/tree_utils

 .. toctree::

View File

@@ -70,36 +70,36 @@ To build and install the MLX python library from source, first, clone MLX from
   git clone git@github.com:ml-explore/mlx.git mlx && cd mlx

-Install `nanobind <https://nanobind.readthedocs.io/en/latest/>`_ with:
-
-.. code-block:: shell
-
-   pip install git+https://github.com/wjakob/nanobind.git
-
 Then simply build and install MLX using pip:

 .. code-block:: shell

-   env CMAKE_BUILD_PARALLEL_LEVEL="" pip install .
+   CMAKE_BUILD_PARALLEL_LEVEL="" pip install .

-For developing use an editable install:
+For developing, install the package with development dependencies, and use an
+editable install:

 .. code-block:: shell

-   env CMAKE_BUILD_PARALLEL_LEVEL="" pip install -e .
+   CMAKE_BUILD_PARALLEL_LEVEL="" pip install -e ".[dev]"

-To make sure the install is working run the tests with:
+Once the development dependencies are installed, you can build faster with:
+
+.. code-block:: shell
+
+   CMAKE_BUILD_PARALLEL_LEVEL="" python setup.py build_ext -j --inplace
+
+Run the tests with:

 .. code-block:: shell

-   pip install ".[testing]"
    python -m unittest discover python/tests

-Optional: Install stubs to enable auto completions and type checking from your IDE:
+Optional: Install stubs to enable auto completions and type checking from your
+IDE:

 .. code-block:: shell

-   pip install ".[dev]"
    python setup.py generate_stubs

 C++ API
@@ -153,11 +153,18 @@ should point to the path to the built metal library.
      - OFF
    * - MLX_BUILD_METAL
      - ON
+   * - MLX_BUILD_CPU
+     - ON
    * - MLX_BUILD_PYTHON_BINDINGS
      - OFF
    * - MLX_METAL_DEBUG
      - OFF
+   * - MLX_BUILD_SAFETENSORS
+     - ON
+   * - MLX_BUILD_GGUF
+     - ON
+   * - MLX_METAL_JIT
+     - OFF

 .. note::
@@ -176,10 +183,37 @@ should point to the path to the built metal library.
   xcrun -sdk macosx --show-sdk-version

Binary Size Minimization
~~~~~~~~~~~~~~~~~~~~~~~~

To produce a smaller binary use the CMake flags ``CMAKE_BUILD_TYPE=MinSizeRel``
and ``BUILD_SHARED_LIBS=ON``.

The MLX CMake build has several additional options to make smaller binaries.
For example, if you don't need the CPU backend or support for safetensors and
GGUF, you can do:

.. code-block:: shell

   cmake .. \
     -DCMAKE_BUILD_TYPE=MinSizeRel \
     -DBUILD_SHARED_LIBS=ON \
     -DMLX_BUILD_CPU=OFF \
     -DMLX_BUILD_SAFETENSORS=OFF \
     -DMLX_BUILD_GGUF=OFF \
     -DMLX_METAL_JIT=ON

The ``MLX_METAL_JIT`` flag minimizes the size of the MLX Metal library, which
contains pre-built GPU kernels, by compiling kernels at run time the first time
they are used in MLX on a given machine. Note that run-time compilation incurs
a cold-start cost which can be anywhere from a few hundred milliseconds to a
few seconds depending on the application. Once a kernel is compiled, it is
cached by the system, and the Metal kernel cache persists across reboots.

Troubleshooting
^^^^^^^^^^^^^^^

Metal not found
~~~~~~~~~~~~~~~

View File

@@ -24,6 +24,7 @@ Array
    array.any
    array.argmax
    array.argmin
+   array.conj
    array.cos
    array.cummax
    array.cummin
@@ -57,3 +58,4 @@ Array
    array.transpose
    array.T
    array.var
+   array.view

View File

@@ -16,3 +16,4 @@ Devices and Streams
    new_stream
    set_default_stream
    stream
+   synchronize

View File

@@ -0,0 +1,19 @@
.. _distributed:

.. currentmodule:: mlx.core.distributed

Distributed Communication
==========================

MLX provides a distributed communication package using MPI. The MPI library is
loaded at runtime; if MPI is available then distributed communication is also
made available.

.. autosummary::
   :toctree: _autosummary

   Group
   is_available
   init
   all_sum
   all_gather
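
As a quick illustration, here is a minimal sketch using the operations listed
above (the reduction only spans multiple processes when the script is launched
via MPI; see the :ref:`usage guide <usage_distributed>` for details):

.. code:: python

   import mlx.core as mx

   if mx.distributed.is_available():
       world = mx.distributed.init()
       # Sum a vector of ones across all ranks in the group
       x = mx.distributed.all_sum(mx.ones(4))
       print(world.rank(), world.size(), x)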

View File

@@ -8,5 +8,10 @@ Linear Algebra
.. autosummary::
   :toctree: _autosummary

+   inv
+   tri_inv
    norm
+   cholesky
+   cholesky_inv
    qr
+   svd

View File

@@ -7,10 +7,13 @@ Metal
   :toctree: _autosummary

    is_available
+   device_info
    get_active_memory
    get_peak_memory
+   reset_peak_memory
    get_cache_memory
    set_memory_limit
    set_cache_limit
+   clear_cache
    start_capture
    stop_capture

View File

@@ -173,6 +173,7 @@ In detail:
   :toctree: _autosummary

    value_and_grad
+   quantize

 .. toctree::

View File

@@ -17,6 +17,8 @@ simple functions.
    gelu_approx
    gelu_fast_approx
    glu
+   hard_shrink
+   hard_tanh
    hardswish
    leaky_relu
    log_sigmoid
@@ -29,6 +31,7 @@ simple functions.
    sigmoid
    silu
    softmax
+   softmin
    softplus
    softshrink
    step

View File

@@ -15,15 +15,21 @@ Layers
    BatchNorm
    Conv1d
    Conv2d
+   Conv3d
    Dropout
    Dropout2d
    Dropout3d
    Embedding
    GELU
+   GLU
    GroupNorm
    GRU
+   HardShrink
+   HardTanh
+   Hardswish
    InstanceNorm
    LayerNorm
+   LeakyReLU
    Linear
    LSTM
    MaxPool1d
@@ -31,16 +37,23 @@ Layers
    Mish
    MultiHeadAttention
    PReLU
+   QuantizedEmbedding
    QuantizedLinear
    RMSNorm
    ReLU
+   ReLU6
    RNN
    RoPE
    SELU
    Sequential
    SiLU
    SinusoidalPositionalEncoding
+   Softmin
    Softshrink
+   Softsign
+   Softmax
+   Softplus
    Step
+   Tanh
    Transformer
    Upsample

View File

@@ -10,6 +10,7 @@ Operations
    abs
    add
+   addmm
    all
    allclose
    any
@@ -19,19 +20,27 @@ Operations
    arcsin
    arcsinh
    arctan
+   arctan2
    arctanh
    argmax
    argmin
    argpartition
    argsort
    array_equal
+   as_strided
    atleast_1d
    atleast_2d
    atleast_3d
+   bitwise_and
+   bitwise_or
+   bitwise_xor
+   block_masked_mm
    broadcast_to
    ceil
    clip
    concatenate
+   conj
+   conjugate
    convolve
    conv1d
    conv2d
@@ -42,11 +51,14 @@ Operations
    cummin
    cumprod
    cumsum
+   degrees
    dequantize
    diag
    diagonal
    divide
    divmod
+   einsum
+   einsum_path
    equal
    erf
    erfinv
@@ -58,8 +70,11 @@ Operations
    floor
    floor_divide
    full
+   gather_mm
+   gather_qmm
    greater
    greater_equal
+   hadamard_transform
    identity
    inner
    isclose
@@ -67,6 +82,8 @@ Operations
    isnan
    isneginf
    isposinf
+   issubdtype
+   left_shift
    less
    less_equal
    linspace
@@ -89,18 +106,24 @@ Operations
    minimum
    moveaxis
    multiply
+   nan_to_num
    negative
+   not_equal
    ones
    ones_like
    outer
    partition
    pad
+   power
    prod
    quantize
    quantized_matmul
+   radians
    reciprocal
+   remainder
    repeat
    reshape
+   right_shift
    round
    rsqrt
    save
@@ -131,11 +154,13 @@ Operations
    tensordot
    tile
    topk
+   trace
    transpose
    tri
    tril
    triu
    var
+   view
    where
    zeros
    zeros_like

View File

@@ -1,5 +1,7 @@
.. _optimizers:

+.. currentmodule:: mlx.optimizers

 Optimizers
 ==========
@@ -29,8 +31,48 @@ model's parameters and the **optimizer state**.
   # Compute the new parameters but also the optimizer state.
   mx.eval(model.parameters(), optimizer.state)
Saving and Loading
------------------

To serialize an optimizer, save its state. To load an optimizer, load and set
the saved state. Here's a simple example:

.. code-block:: python

   import mlx.core as mx
   from mlx.utils import tree_flatten, tree_unflatten
   import mlx.optimizers as optim

   optimizer = optim.Adam(learning_rate=1e-2)

   # Perform some updates with the optimizer
   model = {"w" : mx.zeros((5, 5))}
   grads = {"w" : mx.ones((5, 5))}
   optimizer.update(model, grads)

   # Save the state
   state = tree_flatten(optimizer.state)
   mx.save_safetensors("optimizer.safetensors", dict(state))

   # Later on, for example when loading from a checkpoint,
   # recreate the optimizer and load the state
   optimizer = optim.Adam(learning_rate=1e-2)

   state = tree_unflatten(list(mx.load("optimizer.safetensors").items()))
   optimizer.state = state

Note that not every optimizer configuration parameter is saved in the state.
For example, for Adam the learning rate is saved but the ``betas`` and ``eps``
parameters are not. A good rule of thumb is that if a parameter can be
scheduled then it will be included in the optimizer state.
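
For instance, since the learning rate can be scheduled it lives in the state
and is restored along with it. A minimal sketch of checking this, assuming the
setup from the example above:

.. code-block:: python

   optimizer = optim.Adam(learning_rate=1e-2)
   state = tree_unflatten(list(mx.load("optimizer.safetensors").items()))
   optimizer.state = state
   print(optimizer.learning_rate)  # reflects the restored state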
.. toctree::

   optimizers/optimizer
   optimizers/common_optimizers
   optimizers/schedulers

.. autosummary::
   :toctree: _autosummary

   clip_grad_norm

View File

@@ -44,3 +44,4 @@ we use a splittable version of Threefry, which is a counter-based PRNG.
    split
    truncated_normal
    uniform
+   laplace

View File

@@ -10,6 +10,7 @@ Transforms
    eval
    compile
+   custom_function
    disable_compile
    enable_compile
    grad

View File

@@ -19,3 +19,5 @@ return python trees will be using the default python ``dict``, ``list`` and
    tree_flatten
    tree_unflatten
    tree_map
+   tree_map_with_path
+   tree_reduce

View File

@@ -0,0 +1,166 @@
.. _usage_distributed:

Distributed Communication
=========================

.. currentmodule:: mlx.core.distributed

MLX utilizes `MPI <https://en.wikipedia.org/wiki/Message_Passing_Interface>`_ to
provide distributed communication operations that allow the computational cost
of training or inference to be shared across many physical machines. You can
see a list of the supported operations in the :ref:`API docs<distributed>`.

.. note::
   Many operations are not yet supported or are not as fast as they should be.
   We are adding more, and tuning the ones we have, as we figure out the best
   way to do distributed computing on Macs using MLX.
Getting Started
---------------

MLX already comes with the ability to "talk" to MPI if it is installed on the
machine. The minimal distributed program in MLX is as simple as:

.. code:: python

   import mlx.core as mx

   world = mx.distributed.init()
   x = mx.distributed.all_sum(mx.ones(10))
   print(world.rank(), x)

The program above sums the array ``mx.ones(10)`` across all
distributed processes. If simply run with ``python``, however, only one
process is launched and no distributed communication takes place.

To launch the program in distributed mode we need to use ``mpirun`` or
``mpiexec`` depending on the MPI installation. The simplest possible way is the
following:

.. code:: shell

   $ mpirun -np 2 python test.py
   1 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)
   0 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)

The above launches two processes on the same (local) machine and we can see
both standard output streams. The processes send the array of 1s to each other
and compute the sum, which is printed. Launching with ``mpirun -np 4 ...``
would print 4, and so on.
Installing MPI
---------------

MPI can be installed with Homebrew, using the Anaconda package manager or
compiled from source. Most of our testing is done using ``openmpi`` installed
with the Anaconda package manager as follows:

.. code:: shell

   $ conda install openmpi

Installing with Homebrew may require specifying the location of ``libmpi.dyld``
so that MLX can find it and load it at runtime. This can simply be achieved by
passing the ``DYLD_LIBRARY_PATH`` environment variable to ``mpirun``.

.. code:: shell

   $ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python test.py
Setting up Remote Hosts
-----------------------

MPI can automatically connect to remote hosts and set up the communication over
the network if the remote hosts can be accessed via ssh. A good checklist to
debug connectivity issues is the following:

* ``ssh hostname`` works from all machines to all machines without asking for
  a password or host confirmation
* ``mpirun`` is accessible on all machines. You can call ``mpirun`` using its
  full path to force all machines to use a specific path.
* Ensure that the ``hostname`` used by MPI is the one that you have configured
  in the ``.ssh/config`` files on all machines.

.. note::
   For example, for a hostname ``foo.bar.com``, MPI may pass only ``foo`` to
   ssh if the current hostname matches ``*.bar.com``.

An easy way to pass the host names to MPI is using a host file. A host file
looks like the following, where ``host1`` and ``host2`` should be the fully
qualified domain names or IPs for these hosts.

.. code::

   host1 slots=1
   host2 slots=1

When using MLX, it is very likely that you want to use 1 slot per host, i.e.
one process per host. The host file also needs to contain the current host if
you want to run on the local host. Passing the host file to ``mpirun`` is
simply done using the ``--hostfile`` command line argument, as in the
invocation sketched below.
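
For example (the host file name ``hosts.txt`` is illustrative):

.. code:: shell

   $ mpirun --hostfile hosts.txt python test.py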
Training Example
----------------

In this section we will adapt an MLX training loop to support data parallel
distributed training. Namely, we will average the gradients across a set of
hosts before applying them to the model.

Our training loop looks like the following code snippet if we omit the model,
dataset and optimizer initialization.

.. code:: python

   model = ...
   optimizer = ...
   dataset = ...

   def step(model, x, y):
       loss, grads = loss_grad_fn(model, x, y)
       optimizer.update(model, grads)
       return loss

   for x, y in dataset:
       loss = step(model, x, y)
       mx.eval(loss, model.parameters())

All we have to do to average the gradients across machines is perform an
:func:`all_sum` and divide by the size of the :class:`Group`. Namely, we
have to :func:`mlx.utils.tree_map` the gradients with the following function.

.. code:: python

   def all_avg(x):
       return mx.distributed.all_sum(x) / mx.distributed.init().size()

Putting everything together, our training loop step looks as follows, with
everything else remaining the same. (Note that the group size, not the group
itself, is used as the divisor.)

.. code:: python

   from mlx.utils import tree_map

   def all_reduce_grads(grads):
       N = mx.distributed.init().size()
       if N == 1:
           return grads
       return tree_map(
           lambda x: mx.distributed.all_sum(x) / N,
           grads)

   def step(model, x, y):
       loss, grads = loss_grad_fn(model, x, y)
       grads = all_reduce_grads(grads)  # <--- This line was added
       optimizer.update(model, grads)
       return loss
Tuning All Reduce
-----------------

We are working on improving the performance of all reduce in MLX, but for now
the two main things you can do to get the most out of distributed training are:

1. Perform a few large reductions instead of many small ones to improve
   bandwidth and latency
2. Pass ``--mca btl_tcp_links 4`` to ``mpirun`` to configure it to use 4 TCP
   connections between each host to improve bandwidth (see the launch sketched
   after this list)
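
A combined launch using both a host file and this option could look like the
following (``hosts.txt`` and ``train.py`` are illustrative names):

.. code:: shell

   $ mpirun --hostfile hosts.txt --mca btl_tcp_links 4 python train.py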

View File

@@ -18,7 +18,7 @@ describe below.
 Transforming Compute Graphs
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Lazy evaluation let's us record a compute graph without actually doing any
+Lazy evaluation lets us record a compute graph without actually doing any
 computations. This is useful for function transformations like :func:`grad` and
 :func:`vmap` and graph optimizations.

View File

@@ -3,7 +3,11 @@
 Conversion to NumPy and Other Frameworks
 ========================================

-MLX array implements the `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_.
+MLX array supports conversion between other frameworks with either:
+
+* The `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_.
+* `DLPack <https://dmlc.github.io/dlpack/latest/>`_.
+
 Let's convert an array to NumPy and back.

 .. code-block:: python

View File

@@ -9,3 +9,4 @@ build_example(tutorial.cpp)
 build_example(linear_regression.cpp)
 build_example(logistic_regression.cpp)
 build_example(metal_capture.cpp)
+build_example(distributed.cpp)

View File

@@ -0,0 +1,22 @@
// Copyright © 2024 Apple Inc.

#include <iostream>

#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  if (!distributed::is_available()) {
    std::cout << "No communication backend found" << std::endl;
    return 1;
  }

  auto global_group = distributed::init();
  std::cout << global_group.rank() << " / " << global_group.size() << std::endl;

  array x = ones({10});
  array out = distributed::all_sum(x, global_group);

  std::cout << out << std::endl;
}
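
// A sketch of how to run this example once built (the binary name is assumed
// from build_example(distributed.cpp) in the CMake file above):
//
//   mpirun -np 2 ./distributed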

View File

@@ -11,7 +11,7 @@ int main() {
   // To use Metal debugging and profiling:
   // 1. Build with the MLX_METAL_DEBUG CMake option (i.e. -DMLX_METAL_DEBUG=ON).
   // 2. Run with MTL_CAPTURE_ENABLED=1.
-  assert(metal::start_capture("mlx_trace.gputrace"));
+  metal::start_capture("mlx_trace.gputrace");

   // Start at index two because the default GPU and CPU streams have indices
   // zero and one, respectively. This naming matches the label assigned to each

View File

@@ -89,8 +89,8 @@ void automatic_differentiation() {
   // dfdx is 2 * x

   // Get the second derivative by composing grad with grad
-  auto df2dx2 = grad(grad(fn))(x);
+  auto d2fdx2 = grad(grad(fn))(x);

-  // df2dx2 is 2
+  // d2fdx2 is 2
 }
int main() { int main() {

View File

@@ -1,5 +1,5 @@
-## Build the extensions
+## Build

 ```
 pip install -e .
@@ -16,3 +16,9 @@ And then run:

 ```
 python setup.py build_ext -j8 --inplace
 ```
## Test
```
python test.py
```

View File

@@ -249,15 +249,14 @@ void Axpby::eval_gpu(
   kname << (contiguous_kernel ? "contiguous_" : "general_");
   kname << type_to_name(out);

-  // Make sure the metal library is available and look for it
-  // in the same folder as this executable if needed
-  d.register_library("mlx_ext", metal::get_colocated_mtllib_path);
+  // Make sure the metal library is available
+  d.register_library("mlx_ext");

   // Make a kernel from this metal library
   auto kernel = d.get_kernel(kname.str(), "mlx_ext");

   // Prepare to encode kernel
-  auto compute_encoder = d.get_command_encoder(s.index);
+  auto& compute_encoder = d.get_command_encoder(s.index);
   compute_encoder->setComputePipelineState(kernel);

   // Kernel parameters are registered with buffer indices corresponding to
@@ -266,11 +265,11 @@ void Axpby::eval_gpu(
   size_t nelem = out.size();

   // Encode input arrays to kernel
-  set_array_buffer(compute_encoder, x, 0);
-  set_array_buffer(compute_encoder, y, 1);
+  compute_encoder.set_input_array(x, 0);
+  compute_encoder.set_input_array(y, 1);

   // Encode output arrays to kernel
-  set_array_buffer(compute_encoder, out, 2);
+  compute_encoder.set_output_array(out, 2);

   // Encode alpha and beta
   compute_encoder->setBytes(&alpha_, sizeof(float), 3);
@@ -296,7 +295,7 @@ void Axpby::eval_gpu(
   // Launch the grid with the given number of threads divided among
   // the given threadgroups
-  compute_encoder->dispatchThreads(grid_dims, group_dims);
+  compute_encoder.dispatchThreads(grid_dims, group_dims);
 }

 #else // Metal is not available

View File

@@ -36,8 +36,8 @@ template <typename T>
 }

 #define instantiate_axpby(type_name, type) \
-  template [[host_name("axpby_general_" #type_name)]] \
-  [[kernel]] void axpby_general<type>( \
+  template [[host_name("axpby_general_" #type_name)]] [[kernel]] void \
+  axpby_general<type>( \
       device const type* x [[buffer(0)]], \
       device const type* y [[buffer(1)]], \
       device type* out [[buffer(2)]], \
@@ -48,8 +48,8 @@ template <typename T>
       constant const size_t* y_strides [[buffer(7)]], \
       constant const int& ndim [[buffer(8)]], \
       uint index [[thread_position_in_grid]]); \
-  template [[host_name("axpby_contiguous_" #type_name)]] \
-  [[kernel]] void axpby_contiguous<type>( \
+  template [[host_name("axpby_contiguous_" #type_name)]] [[kernel]] void \
+  axpby_contiguous<type>( \
       device const type* x [[buffer(0)]], \
       device const type* y [[buffer(1)]], \
       device type* out [[buffer(2)]], \

View File

@@ -2,4 +2,4 @@
 import mlx.core as mx

-from .mlx_sample_extensions import *
+from ._ext import axpby

View File

@@ -3,6 +3,6 @@ requires = [
"setuptools>=42", "setuptools>=42",
"cmake>=3.24", "cmake>=3.24",
"mlx>=0.9.0", "mlx>=0.9.0",
"nanobind@git+https://github.com/wjakob/nanobind.git#egg=4148debcf91f5ccab0c3b8d67b5c3cabd61f407f", "nanobind@git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"

View File

@@ -1,4 +1,4 @@
 setuptools>=42
 cmake>=3.24
-mlx>=0.9.0
-nanobind@git+https://github.com/wjakob/nanobind.git#egg=4148debcf91f5ccab0c3b8d67b5c3cabd61f407f
+mlx>=0.16.2
+nanobind==2.0

View File

@@ -0,0 +1,10 @@
import mlx.core as mx
from mlx_sample_extensions import axpby
a = mx.ones((3, 4))
b = mx.ones((3, 4))
c = axpby(a, b, 4.0, 2.0, stream=mx.cpu)
print(f"c shape: {c.shape}")
print(f"c dtype: {c.dtype}")
print(f"c correct: {mx.all(c == 6.0).item()}")

View File

@@ -6,6 +6,7 @@ target_sources(
   ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
@@ -19,11 +20,17 @@ target_sources(
   ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h
 )

+if (MLX_BUILD_CPU)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
+else()
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_cpu)
+endif()
+
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/distributed)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)

 if (MLX_BUILD_ACCELERATE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
-else()
+elseif(MLX_BUILD_CPU)
   target_sources(
     mlx
     PRIVATE

View File

@@ -1,5 +1,4 @@
 // Copyright © 2023-2024 Apple Inc.

 #include <functional>

 #include "mlx/array.h"
@@ -18,6 +17,10 @@ bool in_tracing() {
   return detail::InTracing::in_tracing();
 }

+bool retain_graph() {
+  return detail::RetainGraph::retain_graph();
+}
+
 } // namespace

 array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
@@ -93,13 +96,17 @@ void array::detach() {
 }

 void array::eval() {
-  if (!is_evaled()) {
+  // Ensure the array is ready to be read
+  if (status() == Status::scheduled) {
+    event().wait();
+    set_status(Status::available);
+  } else if (status() == Status::unscheduled) {
     mlx::core::eval({*this});
   }
 }

 bool array::is_tracer() const {
-  return array_desc_->is_tracer && in_tracing();
+  return array_desc_->is_tracer && in_tracing() || retain_graph();
 }

 void array::set_data(allocator::Buffer buffer, deleter_t d) {
@@ -163,6 +170,40 @@ void array::move_shared_buffer(array other) {
   move_shared_buffer(other, other.strides(), other.flags(), other.data_size());
 }

+array::~array() {
+  if (array_desc_ == nullptr) {
+    return;
+  }
+
+  // Ignore arrays that might be detached during eval
+  if (status() == array::Status::scheduled) {
+    return;
+  }
+
+  // Break circular reference for non-detached arrays with siblings
+  if (auto n = siblings().size(); n > 0) {
+    bool do_detach = true;
+    // If all siblings have siblings.size() references except
+    // the one we are currently destroying (which has siblings.size() + 1)
+    // then there are no more external references
+    do_detach &= (array_desc_.use_count() == (n + 1));
+    for (auto& s : siblings()) {
+      do_detach &= (s.array_desc_.use_count() == n);
+      if (!do_detach) {
+        break;
+      }
+    }
+    if (do_detach) {
+      for (auto& s : siblings()) {
+        for (auto& ss : s.siblings()) {
+          ss.array_desc_ = nullptr;
+        }
+        s.array_desc_->siblings.clear();
+      }
+    }
+  }
+}
+
 void array::ArrayDesc::init() {
   strides.resize(shape.size());
   size = 1;
@@ -170,13 +211,13 @@ void array::ArrayDesc::init() {
     strides[i] = size;
     size *= shape[i];
   }
-  for (auto& in : inputs) {
+  for (const auto& in : inputs) {
     is_tracer |= in.is_tracer();
   }
 }

 array::ArrayDesc::ArrayDesc(std::vector<int> shape, Dtype dtype)
-    : shape(std::move(shape)), dtype(dtype) {
+    : shape(std::move(shape)), dtype(dtype), status(Status::available) {
   init();
 }

@@ -187,6 +228,7 @@ array::ArrayDesc::ArrayDesc(
     std::vector<array> inputs)
     : shape(std::move(shape)),
       dtype(dtype),
+      status(Status::unscheduled),
       primitive(std::move(primitive)),
       inputs(std::move(inputs)) {
   init();
@@ -194,7 +236,7 @@ array::ArrayDesc::ArrayDesc(

 array::ArrayDesc::~ArrayDesc() {
   // When an array description is destroyed it will delete a bunch of arrays
-  // that may also destory their corresponding descriptions and so on and so
+  // that may also destroy their corresponding descriptions and so on and so
   // forth.
   //
   // This calls recursively the destructor and can result in stack overflow, we

View File

@@ -9,6 +9,7 @@
#include "mlx/allocator.h" #include "mlx/allocator.h"
#include "mlx/dtype.h" #include "mlx/dtype.h"
#include "mlx/event.h"
namespace mlx::core { namespace mlx::core {
@@ -72,32 +73,32 @@ class array {
this->array_desc_ = other.array_desc_; this->array_desc_ = other.array_desc_;
} }
return *this; return *this;
}; }
/** The size of the array's datatype in bytes. */ /** The size of the array's datatype in bytes. */
size_t itemsize() const { size_t itemsize() const {
return size_of(dtype()); return size_of(dtype());
}; }
/** The number of elements in the array. */ /** The number of elements in the array. */
size_t size() const { size_t size() const {
return array_desc_->size; return array_desc_->size;
}; }
/** The number of bytes in the array. */ /** The number of bytes in the array. */
size_t nbytes() const { size_t nbytes() const {
return size() * itemsize(); return size() * itemsize();
}; }
/** The number of dimensions of the array. */ /** The number of dimensions of the array. */
size_t ndim() const { size_t ndim() const {
return array_desc_->shape.size(); return array_desc_->shape.size();
}; }
/** The shape of the array as a vector of integers. */ /** The shape of the array as a vector of integers. */
const std::vector<int>& shape() const { const std::vector<int>& shape() const {
return array_desc_->shape; return array_desc_->shape;
}; }
/** /**
* Get the size of the corresponding dimension. * Get the size of the corresponding dimension.
@@ -106,17 +107,26 @@ class array {
* bounds checking. */ * bounds checking. */
int shape(int dim) const { int shape(int dim) const {
return shape().at(dim < 0 ? dim + ndim() : dim); return shape().at(dim < 0 ? dim + ndim() : dim);
}; }
/** The strides of the array. */ /** The strides of the array. */
const std::vector<size_t>& strides() const { const std::vector<size_t>& strides() const {
return array_desc_->strides; return array_desc_->strides;
}; }
/**
* Get the stride of the corresponding dimension.
*
* This function supports negative indexing and provides
* bounds checking. */
size_t strides(int dim) const {
return strides().at(dim < 0 ? dim + ndim() : dim);
}
/** Get the arrays data type. */ /** Get the arrays data type. */
Dtype dtype() const { Dtype dtype() const {
return array_desc_->dtype; return array_desc_->dtype;
}; }
/** Evaluate the array. */ /** Evaluate the array. */
void eval(); void eval();
@@ -150,10 +160,10 @@ class array {
friend bool operator==(const ArrayIterator& a, const ArrayIterator& b) { friend bool operator==(const ArrayIterator& a, const ArrayIterator& b) {
return a.arr.id() == b.arr.id() && a.idx == b.idx; return a.arr.id() == b.arr.id() && a.idx == b.idx;
}; }
friend bool operator!=(const ArrayIterator& a, const ArrayIterator& b) { friend bool operator!=(const ArrayIterator& a, const ArrayIterator& b) {
return !(a == b); return !(a == b);
}; }
private: private:
const array& arr; const array& arr;
@@ -199,7 +209,7 @@ class array {
allocator::Buffer buffer; allocator::Buffer buffer;
deleter_t d; deleter_t d;
Data(allocator::Buffer buffer, deleter_t d = allocator::free) Data(allocator::Buffer buffer, deleter_t d = allocator::free)
: buffer(buffer), d(d){}; : buffer(buffer), d(d) {}
// Not copyable // Not copyable
Data(const Data& d) = delete; Data(const Data& d) = delete;
Data& operator=(const Data& d) = delete; Data& operator=(const Data& d) = delete;
@@ -220,22 +230,22 @@ class array {
/** The array's primitive. */ /** The array's primitive. */
Primitive& primitive() const { Primitive& primitive() const {
return *(array_desc_->primitive); return *(array_desc_->primitive);
}; }
/** A shared pointer to the array's primitive. */ /** A shared pointer to the array's primitive. */
std::shared_ptr<Primitive>& primitive_ptr() const { std::shared_ptr<Primitive>& primitive_ptr() const {
return array_desc_->primitive; return array_desc_->primitive;
}; }
/** Check if the array has an attached primitive or is a leaf node. */ /** Check if the array has an attached primitive or is a leaf node. */
bool has_primitive() const { bool has_primitive() const {
return array_desc_->primitive != nullptr; return array_desc_->primitive != nullptr;
}; }
/** The array's inputs. */ /** The array's inputs. */
const std::vector<array>& inputs() const { const std::vector<array>& inputs() const {
return array_desc_->inputs; return array_desc_->inputs;
}; }
std::vector<array>& inputs() { std::vector<array>& inputs() {
return array_desc_->inputs; return array_desc_->inputs;
@@ -249,24 +259,18 @@ class array {
/** The array's siblings. */ /** The array's siblings. */
const std::vector<array>& siblings() const { const std::vector<array>& siblings() const {
return array_desc_->siblings; return array_desc_->siblings;
}; }
/** The array's siblings. */
std::vector<array>& siblings() {
return array_desc_->siblings;
}
void set_siblings(std::vector<array> siblings, uint16_t position) { void set_siblings(std::vector<array> siblings, uint16_t position) {
array_desc_->siblings = std::move(siblings); array_desc_->siblings = std::move(siblings);
array_desc_->position = position; array_desc_->position = position;
} }
/** The i-th output of the array's primitive. */
const array& output(int i) const {
if (i == array_desc_->position) {
return *this;
} else if (i < array_desc_->position) {
return siblings()[i];
} else {
return siblings()[i + 1];
}
};
/** The outputs of the array's primitive (i.e. this array and /** The outputs of the array's primitive (i.e. this array and
* its siblings) in the order the primitive expects. */ * its siblings) in the order the primitive expects. */
std::vector<array> outputs() const { std::vector<array> outputs() const {
@@ -277,7 +281,7 @@ class array {
outputs.push_back(*this); outputs.push_back(*this);
outputs.insert(outputs.end(), siblings().begin() + idx, siblings().end()); outputs.insert(outputs.end(), siblings().begin() + idx, siblings().end());
return outputs; return outputs;
}; }
/** Detach the array from the graph. */ /** Detach the array from the graph. */
void detach(); void detach();
@@ -285,19 +289,19 @@ class array {
/** Get the Flags bit-field. */ /** Get the Flags bit-field. */
const Flags& flags() const { const Flags& flags() const {
return array_desc_->flags; return array_desc_->flags;
}; }
/** The size (in elements) of the underlying buffer the array points to. */ /** The size (in elements) of the underlying buffer the array points to. */
size_t data_size() const { size_t data_size() const {
return array_desc_->data_size; return array_desc_->data_size;
}; }
allocator::Buffer& buffer() { allocator::Buffer& buffer() {
return array_desc_->data->buffer; return array_desc_->data->buffer;
}; }
const allocator::Buffer& buffer() const { const allocator::Buffer& buffer() const {
return array_desc_->data->buffer; return array_desc_->data->buffer;
}; }
// Return a copy of the shared pointer // Return a copy of the shared pointer
// to the array::Data struct // to the array::Data struct
@@ -308,16 +312,35 @@ class array {
template <typename T> template <typename T>
T* data() { T* data() {
return static_cast<T*>(array_desc_->data_ptr); return static_cast<T*>(array_desc_->data_ptr);
}; }
template <typename T> template <typename T>
const T* data() const { const T* data() const {
return static_cast<T*>(array_desc_->data_ptr); return static_cast<T*>(array_desc_->data_ptr);
}; }
// Check if the array has been evaluated enum Status { unscheduled, scheduled, available };
bool is_evaled() const {
return array_desc_->data != nullptr; bool is_available() const {
return status() == Status::available;
}
Status status() const {
return array_desc_->status;
}
void set_status(Status s) const {
array_desc_->status = s;
}
// Get the array's shared event
Event& event() const {
return array_desc_->event;
}
// Attach an event to a not yet evaluated array
void attach_event(Event e) const {
array_desc_->event = std::move(e);
} }
// Mark the array as a tracer array (true) or not. // Mark the array as a tracer array (true) or not.
@@ -358,6 +381,8 @@ class array {
array_desc_ = other.array_desc_; array_desc_ = other.array_desc_;
} }
~array();
private: private:
// Initialize the arrays data // Initialize the arrays data
template <typename It> template <typename It>
@@ -370,6 +395,11 @@ class array {
Dtype dtype; Dtype dtype;
std::shared_ptr<Primitive> primitive; std::shared_ptr<Primitive> primitive;
Status status;
// An event on the array used for synchronization
Event event;
// Indicates an array is being used in a graph transform // Indicates an array is being used in a graph transform
// and should not be detached from the graph // and should not be detached from the graph
bool is_tracer{false}; bool is_tracer{false};
@@ -470,10 +500,11 @@ T array::item() const {
if (size() != 1) { if (size() != 1) {
throw std::invalid_argument("item can only be called on arrays of size 1."); throw std::invalid_argument("item can only be called on arrays of size 1.");
} }
if (!is_evaled()) { if (status() == Status::unscheduled) {
throw std::invalid_argument( throw std::invalid_argument(
"item() const can only be called on evaled arrays"); "item() const can only be called on evaled arrays");
} }
const_cast<array*>(this)->eval();
return *data<T>(); return *data<T>();
} }

View File

@@ -1,9 +1,9 @@
-// Copyright © 2023 Apple Inc.
+// Copyright © 2023-2024 Apple Inc.

 #include <cassert>

+#include <Accelerate/Accelerate.h>
 #include <simd/vector.h>
-#include <vecLib/vDSP.h>

 #include "mlx/backend/common/copy.h"
 #include "mlx/primitives.h"

View File

@@ -1,9 +1,8 @@
-// Copyright © 2023 Apple Inc.
+// Copyright © 2023-2024 Apple Inc.

 #include <cassert>

-#include <vecLib/BNNS/bnns.h>
-#include <vecLib/cblas_new.h>
+#include <Accelerate/Accelerate.h>

 #include "mlx/backend/accelerate/utils.h"
 #include "mlx/backend/common/copy.h"
@@ -196,6 +195,40 @@ inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
   return matmul_bnns_general(a_pre, b_pre, out);
 }
template <typename T>
inline void mask_matrix(
    T* data,
    const bool* mask,
    int tile_size,
    const int X,
    const int Y,
    const size_t X_data_str,
    const size_t Y_data_str,
    const size_t X_mask_str,
    const size_t Y_mask_str) {
  int tX = (X + tile_size - 1) / tile_size;
  int tY = (Y + tile_size - 1) / tile_size;

  for (int i = 0; i < tX; i++) {
    for (int j = 0; j < tY; j++) {
      bool do_mask = mask[i * X_mask_str + j * Y_mask_str];
      if (!do_mask) {
        int loc_x = i * tile_size;
        int loc_y = j * tile_size;
        T* data_block = data + loc_x * X_data_str + loc_y * Y_data_str;

        int size_x = std::min(tile_size, X - loc_x);
        int size_y = std::min(tile_size, Y - loc_y);
        for (int ii = 0; ii < size_x; ii++) {
          for (int jj = 0; jj < size_y; jj++) {
            data_block[ii * X_data_str + jj * Y_data_str] = T(0.);
          }
        }
      }
    }
  }
}
 } // namespace

 void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {

View File

@@ -3,8 +3,7 @@
#include <cassert> #include <cassert>
#include <cmath> #include <cmath>
#include <vecLib/vDSP.h> #include <Accelerate/Accelerate.h>
#include <vecLib/vForce.h>
#include "mlx/allocator.h" #include "mlx/allocator.h"
#include "mlx/backend/common/binary.h" #include "mlx/backend/common/binary.h"
@@ -31,11 +30,13 @@ DEFAULT(ArgPartition)
DEFAULT(ArgReduce) DEFAULT(ArgReduce)
DEFAULT(ArgSort) DEFAULT(ArgSort)
DEFAULT(AsStrided) DEFAULT(AsStrided)
DEFAULT(BlockMaskedMM)
DEFAULT(Broadcast) DEFAULT(Broadcast)
DEFAULT(Ceil) DEFAULT(Ceil)
DEFAULT(Concatenate) DEFAULT(Concatenate)
DEFAULT(Conjugate)
DEFAULT(Copy) DEFAULT(Copy)
DEFAULT_MULTI(CustomVJP) DEFAULT_MULTI(CustomTransforms)
DEFAULT_MULTI(Depends) DEFAULT_MULTI(Depends)
DEFAULT_MULTI(DivMod) DEFAULT_MULTI(DivMod)
DEFAULT(NumberOfElements) DEFAULT(NumberOfElements)
@@ -45,8 +46,11 @@ DEFAULT(ErfInv)
DEFAULT(FFT) DEFAULT(FFT)
DEFAULT(Floor) DEFAULT(Floor)
DEFAULT(Gather) DEFAULT(Gather)
DEFAULT(GatherMM)
DEFAULT(GatherQMM)
DEFAULT(Greater) DEFAULT(Greater)
DEFAULT(GreaterEqual) DEFAULT(GreaterEqual)
DEFAULT(Hadamard)
DEFAULT(Less) DEFAULT(Less)
DEFAULT(LessEqual) DEFAULT(LessEqual)
DEFAULT(Load) DEFAULT(Load)
@@ -76,6 +80,7 @@ DEFAULT(StopGradient)
DEFAULT_MULTI(SVD) DEFAULT_MULTI(SVD)
DEFAULT(Transpose) DEFAULT(Transpose)
DEFAULT(Inverse) DEFAULT(Inverse)
DEFAULT(Cholesky)
void Abs::eval_cpu(const std::vector<array>& inputs, array& out) { void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1); assert(inputs.size() == 1);
@@ -97,7 +102,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
auto& b = inputs[1]; auto& b = inputs[1];
if (a.dtype() == float32) { if (a.dtype() == float32) {
binary( binary_op<float>(
a, a,
b, b,
out, out,
@@ -112,7 +117,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
vDSP_vadd((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n); vDSP_vadd((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n);
}); });
} else if (a.dtype() == int32) { } else if (a.dtype() == int32) {
binary( binary_op<int>(
a, a,
b, b,
out, out,
@@ -127,7 +132,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
vDSP_vaddi((const int*)a, 1, (const int*)b, 1, (int*)o, 1, n); vDSP_vaddi((const int*)a, 1, (const int*)b, 1, (int*)o, 1, n);
}); });
} else { } else {
binary(a, b, out, [](auto x, auto y) { return x + y; }); eval(inputs, out);
} }
} }
@@ -191,6 +196,26 @@ void ArcTan::eval_cpu(const std::vector<array>& inputs, array& out) {
} }
} }
void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (out.dtype() == float32 && a.flags().row_contiguous &&
b.flags().row_contiguous) {
if (a.is_donatable()) {
out.copy_shared_buffer(a);
} else if (b.is_donatable()) {
out.copy_shared_buffer(b);
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
int size = a.data_size();
vvatan2f(out.data<float>(), a.data<float>(), b.data<float>(), &size);
} else {
eval(inputs, out);
}
}
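ArcTan2 follows the same shape: when both inputs are row-contiguous float32, the output reuses a donatable input buffer where possible, then vForce's vvatan2f fills out[i] = atan2(a[i], b[i]) in one call. A sketch of that call in isolation (macOS only; -framework Accelerate):

#include <Accelerate/Accelerate.h>
#include <cstdio>

int main() {
  float y[3] = {0.f, 1.f, -1.f};
  float x[3] = {1.f, 0.f, 0.f};
  float out[3];
  int n = 3;
  vvatan2f(out, y, x, &n); // out[i] = atan2(y[i], x[i])
  std::printf("%g %g %g\n", out[0], out[1], out[2]); // 0, pi/2, -pi/2
}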
void ArcTanh::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
@@ -262,7 +287,7 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& b = inputs[1];
  if (a.dtype() == int32) {
-    binary(
+    binary_op<int>(
        a,
        b,
        out,
@@ -275,7 +300,7 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vdivi((const int*)b, 1, (const int*)a, 1, (int*)o, 1, n);
        });
  } else if (a.dtype() == float32) {
-    binary(
+    binary_op<float>(
        a,
        b,
        out,
@@ -290,7 +315,7 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vdiv((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
        });
  } else {
-    binary(a, b, out, [](auto x, auto y) { return x / y; });
+    eval(inputs, out);
  }
}
@@ -301,12 +326,8 @@ void Exp::eval_cpu(const std::vector<array>& inputs, array& out) {
    set_unary_output_data(in, out);
    auto size = in.data_size();
    vvexpf(out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
-  } else if (issubdtype(out.dtype(), inexact)) {
-    unary_fp(in, out, [](auto x) { return std::exp(x); });
  } else {
-    throw std::invalid_argument(
-        "[exp] Cannot exponentiate elements in array"
-        " with non floating point type.");
+    eval(inputs, out);
  }
}
@@ -368,12 +389,8 @@ void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
    auto size = in.data_size();
    vvlog1pf(
        out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
-  } else if (issubdtype(out.dtype(), inexact)) {
-    unary_fp(in, out, [](auto x) { return std::log1p(x); });
  } else {
-    throw std::invalid_argument(
-        "[log1p] Cannot compute log of elements in array with"
-        " non floating point type.");
+    eval(inputs, out);
  }
}
@@ -383,7 +400,7 @@ void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& b = inputs[1];
  if (a.dtype() == float32) {
-    binary(
+    binary_op<float>(
        a,
        b,
        out,
@@ -398,7 +415,7 @@ void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vmul((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n);
        });
  } else {
-    binary(a, b, out, [](auto x, auto y) { return x * y; });
+    eval(inputs, out);
  }
}
@@ -409,7 +426,7 @@ void Negative::eval_cpu(const std::vector<array>& inputs, array& out) {
    set_unary_output_data(in, out);
    vDSP_vneg(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
  } else {
-    unary(in, out, [](auto x) { return -x; });
+    eval(inputs, out);
  }
}
@@ -496,7 +513,7 @@ void Square::eval_cpu(const std::vector<array>& inputs, array& out) {
    auto size = in.data_size();
    vDSP_vsq(in.data<float>(), 1, out.data<float>(), 1, size);
  } else {
-    unary(in, out, [](auto x) { return x * x; });
+    eval(inputs, out);
  }
}
@@ -522,7 +539,7 @@ void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& b = inputs[1];
  if (a.dtype() == float32) {
-    binary(
+    binary_op<float>(
        a,
        b,
        out,
@@ -540,7 +557,7 @@ void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vsub((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
        });
  } else if (a.dtype() == int32) {
-    binary(
+    binary_op<int>(
        a,
        b,
        out,
@@ -552,7 +569,7 @@ void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
        },
        UseDefaultBinaryOp());
  } else {
-    binary(a, b, out, [](auto x, auto y) { return x - y; });
+    eval(inputs, out);
  }
}

View File

@@ -2,8 +2,8 @@
#include <cassert>
+#include <Accelerate/Accelerate.h>
#include <simd/vector.h>
-#include <vecLib/vDSP.h>
#include "mlx/backend/common/reduce.h"
#include "mlx/primitives.h"

View File

@@ -3,7 +3,10 @@
#include <cassert>
#include <limits>
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_neon.h>
+#endif
#include <simd/math.h>
#include <simd/vector.h>
@@ -53,25 +56,26 @@ inline simd_float16 simd_fast_exp(simd_float16 x) {
  return (*(simd_float16*)&epart) * x;
}

+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/**
 * The ARM neon equivalent of the fast exp above.
 */
inline float16x8_t neon_fast_exp(float16x8_t x) {
-  x = vmulq_f16(x, vdupq_n_f16(1.442695)); // multiply with log_2(e)
-  x = vmaxq_f16(x, vdupq_n_f16(-14)); // clamp under with -14
-  x = vminq_f16(x, vdupq_n_f16(14)); // clamp over with 14
-  float16x8_t ipart = vrndmq_f16(vaddq_f16(x, vdupq_n_f16(0.5)));
+  x = vmulq_f16(x, vdupq_n_f16(float16_t(1.442695f))); // multiply with log_2(e)
+  x = vmaxq_f16(x, vdupq_n_f16(float16_t(-14.f))); // clamp under with -14
+  x = vminq_f16(x, vdupq_n_f16(float16_t(14.f))); // clamp over with 14
+  float16x8_t ipart = vrndmq_f16(vaddq_f16(x, vdupq_n_f16(float16_t(0.5f))));
  float16x8_t fpart = vsubq_f16(x, ipart);
-  x = vdupq_n_f16(1.535336188319500e-4f);
-  x = vfmaq_f16(vdupq_n_f16(1.339887440266574e-3f), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(9.618437357674640e-3f), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(5.550332471162809e-2f), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(2.402264791363012e-1f), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(6.931472028550421e-1f), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(1.000000000000000f), x, fpart);
+  x = vdupq_n_f16(float16_t(1.535336188319500e-4f));
+  x = vfmaq_f16(vdupq_n_f16(float16_t(1.339887440266574e-3f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(9.618437357674640e-3f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(5.550332471162809e-2f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(2.402264791363012e-1f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(6.931472028550421e-1f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(1.000000000000000f)), x, fpart);

  // generate 2**ipart in the floating point representation using integer
  // bitshifting
@@ -107,53 +111,6 @@ inline float16_t neon_reduce_add(float16x8_t x) {
  return vget_lane_f16(y, 0);
}
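Both fast-exp variants compute exp(v) as 2^(v*log2(e)), splitting the exponent into an integer part (handled by bit-shifting into the IEEE-754 exponent field) and a fractional part (handled by a degree-5 polynomial). A scalar reference sketch of the same algorithm, with the constants copied from the code above (illustrative only, not part of this changeset):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

float fast_exp(float v) {
  float x = v * 1.442695f;                  // v * log2(e)
  x = std::fmin(std::fmax(x, -14.f), 14.f); // clamp to a safe range
  float ipart = std::floor(x + 0.5f);
  float fpart = x - ipart;
  // Polynomial approximation of 2^fpart on [-0.5, 0.5]
  float p = 1.535336188319500e-4f;
  p = p * fpart + 1.339887440266574e-3f;
  p = p * fpart + 9.618437357674640e-3f;
  p = p * fpart + 5.550332471162809e-2f;
  p = p * fpart + 2.402264791363012e-1f;
  p = p * fpart + 6.931472028550421e-1f;
  p = p * fpart + 1.f;
  // 2^ipart via the IEEE-754 exponent field
  uint32_t bits = static_cast<uint32_t>(int32_t(ipart) + 127) << 23;
  float epart;
  std::memcpy(&epart, &bits, sizeof(epart));
  return epart * p;
}

int main() {
  std::printf("%f vs %f\n", fast_exp(1.f), std::exp(1.f)); // close to e
}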
-template <typename T, typename VT>
-struct AccelerateSimdOps {
-  VT init(T a) {
-    return a;
-  }
-  VT load(const T* a) {
-    return *(VT*)a;
-  }
-  void store(T* dst, VT x) {
-    *(VT*)dst = x;
-  }
-  VT max(VT a, VT b) {
-    return simd_max(a, b);
-  };
-  VT exp(VT x) {
-    return simd_fast_exp(x);
-  }
-  VT add(VT a, VT b) {
-    return a + b;
-  }
-  VT sub(VT a, T b) {
-    return a - b;
-  }
-  VT mul(VT a, VT b) {
-    return a * b;
-  }
-  VT mul(VT a, T b) {
-    return a * b;
-  }
-  T reduce_max(VT x) {
-    return simd_reduce_max(x);
-  }
-  T reduce_add(VT x) {
-    return simd_reduce_add(x);
-  }
-};
template <typename T, typename VT>
struct NeonFp16SimdOps {
  VT init(T a) {
@@ -170,7 +127,7 @@ struct NeonFp16SimdOps {
  VT max(VT a, VT b) {
    return vmaxq_f16(a, b);
-  };
+  }
  VT exp(VT x) {
    return neon_fast_exp(x);
@@ -201,6 +158,55 @@ struct NeonFp16SimdOps {
  }
};

+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <typename T, typename VT>
+struct AccelerateSimdOps {
+  VT init(T a) {
+    return a;
+  }
+  VT load(const T* a) {
+    return *(VT*)a;
+  }
+  void store(T* dst, VT x) {
+    *(VT*)dst = x;
+  }
+  VT max(VT a, VT b) {
+    return simd_max(a, b);
+  }
+  VT exp(VT x) {
+    return simd_fast_exp(x);
+  }
+  VT add(VT a, VT b) {
+    return a + b;
+  }
+  VT sub(VT a, T b) {
+    return a - b;
+  }
+  VT mul(VT a, VT b) {
+    return a * b;
+  }
+  VT mul(VT a, T b) {
+    return a * b;
+  }
+  T reduce_max(VT x) {
+    return simd_reduce_max(x);
+  }
+  T reduce_add(VT x) {
+    return simd_reduce_add(x);
+  }
+};
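Both Ops structs expose the same load/store/max/exp/add/reduce interface, so a single templated softmax drives either float SIMD vectors or fp16 NEON vectors. A scalar stand-in with the same interface (hypothetical names, vector width 1; a sketch of the pipeline shape, not MLX's actual kernel) makes the design visible:

#include <algorithm>
#include <cmath>
#include <cstdio>

struct ScalarOps {
  float init(float a) { return a; }
  float load(const float* a) { return *a; }
  void store(float* dst, float x) { *dst = x; }
  float max(float a, float b) { return std::max(a, b); }
  float exp(float x) { return std::exp(x); }
  float add(float a, float b) { return a + b; }
  float sub(float a, float b) { return a - b; }
  float mul(float a, float b) { return a * b; }
  float reduce_max(float x) { return x; }
  float reduce_add(float x) { return x; }
};

template <typename Ops>
void softmax_row(const float* in, float* out, int n) {
  Ops ops;
  float maximum = ops.init(-INFINITY);
  for (int i = 0; i < n; i++)
    maximum = ops.max(maximum, ops.load(in + i)); // pass 1: row max
  float denom = 0.f;
  for (int i = 0; i < n; i++) {
    float e = ops.exp(ops.sub(ops.load(in + i), maximum)); // pass 2: exp
    ops.store(out + i, e);
    denom = ops.add(denom, e);
  }
  for (int i = 0; i < n; i++)
    ops.store(out + i, ops.mul(ops.load(out + i), 1.f / denom)); // normalize
}

int main() {
  float in[3] = {1.f, 2.f, 3.f}, out[3];
  softmax_row<ScalarOps>(in, out, 3);
  std::printf("%f %f %f\n", out[0], out[1], out[2]);
}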
template <typename T, typename AccT, typename VT, typename Ops, int N>
void softmax(const array& in, array& out) {
  Ops ops;
@@ -362,12 +368,16 @@ void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
            AccelerateSimdOps<float, simd_float16>,
            16>(in, out);
      } else {
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        softmax<
            float16_t,
            float16_t,
            float16x8_t,
            NeonFp16SimdOps<float16_t, float16x8_t>,
            8>(in, out);
+#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        eval(inputs, out); // Redirect to common backend for consistency
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
      }
      break;
    case bfloat16:

View File

@@ -1,8 +1,8 @@
-// Copyright © 2023 Apple Inc.
+// Copyright © 2023-2024 Apple Inc.
#pragma once
-#include <vecLib/BNNS/bnns.h>
+#include <Accelerate/Accelerate.h>
#include "mlx/dtype.h"
namespace mlx::core {

View File

@@ -37,15 +37,20 @@ target_sources(
  ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/erf.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/reduce_utils.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
@@ -54,6 +59,7 @@ target_sources(
  ${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
  ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
)

View File

@@ -196,6 +196,20 @@ void LogAddExp::eval(const std::vector<array>& inputs, array& out) {
  }
}
void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalAnd requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalAnd());
}
void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalOr requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalOr());
}
void Maximum::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
@@ -236,4 +250,82 @@ void Subtract::eval(const std::vector<array>& inputs, array& out) {
  binary(a, b, out, detail::Subtract());
}
void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto dispatch_type = [&a, &b, &out](auto op) {
switch (out.dtype()) {
      case bool_:
        binary_op<bool>(a, b, out, op);
        break;
      case uint8:
        binary_op<uint8_t>(a, b, out, op);
binary_op<uint8_t>(a, b, out, op);
break;
case uint16:
binary_op<uint16_t>(a, b, out, op);
break;
case uint32:
binary_op<uint32_t>(a, b, out, op);
break;
case uint64:
binary_op<uint64_t>(a, b, out, op);
break;
case int8:
binary_op<int8_t>(a, b, out, op);
break;
case int16:
binary_op<int16_t>(a, b, out, op);
break;
case int32:
binary_op<int32_t>(a, b, out, op);
break;
case int64:
binary_op<int64_t>(a, b, out, op);
break;
default:
throw std::runtime_error(
"[BitwiseBinary::eval_cpu] Type not supported");
break;
}
};
switch (op_) {
case BitwiseBinary::And:
dispatch_type(detail::BitwiseAnd());
break;
case BitwiseBinary::Or:
dispatch_type(detail::BitwiseOr());
break;
case BitwiseBinary::Xor:
dispatch_type(detail::BitwiseXor());
break;
case BitwiseBinary::LeftShift:
dispatch_type(detail::LeftShift());
break;
case BitwiseBinary::RightShift:
dispatch_type(detail::RightShift());
break;
}
}
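The dispatch above composes two independent choices: the element type (switch on out.dtype()) and the operator (switch on op_), with the operator passed as a functor into a generic lambda. A minimal standalone version of that pattern (illustrative names only, not MLX code):

#include <cstdint>
#include <cstdio>
#include <functional>

enum class DType { Int32, Int64 };
enum class Op { And, Or };

template <typename T, typename F>
void apply(const void* a, const void* b, void* out, int n, F op) {
  auto* x = static_cast<const T*>(a);
  auto* y = static_cast<const T*>(b);
  auto* o = static_cast<T*>(out);
  for (int i = 0; i < n; i++) o[i] = op(x[i], y[i]);
}

void bitwise(const void* a, const void* b, void* out, int n, DType t, Op op) {
  // First pick the concrete element type, then the operator functor.
  auto dispatch_type = [&](auto f) {
    switch (t) {
      case DType::Int32: apply<int32_t>(a, b, out, n, f); break;
      case DType::Int64: apply<int64_t>(a, b, out, n, f); break;
    }
  };
  switch (op) {
    case Op::And: dispatch_type(std::bit_and<>()); break;
    case Op::Or: dispatch_type(std::bit_or<>()); break;
  }
}

int main() {
  int32_t a[2] = {6, 5}, b[2] = {3, 3}, o[2];
  bitwise(a, b, o, 2, DType::Int32, Op::And);
  std::printf("%d %d\n", o[0], o[1]); // 2 1
}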
void ArcTan2::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
const auto& a = inputs[0];
const auto& b = inputs[1];
if (out.dtype() == float32) {
binary_op<float>(a, b, out, detail::ArcTan2());
} else if (out.dtype() == float16) {
binary_op<float16_t>(a, b, out, detail::ArcTan2());
} else if (out.dtype() == bfloat16) {
binary_op<bfloat16_t>(a, b, out, detail::ArcTan2());
} else if (issubdtype(out.dtype(), inexact)) {
std::ostringstream err;
err << "[arctan2] Does not support " << out.dtype();
throw std::invalid_argument(err.str());
} else {
throw std::invalid_argument(
"[arctan2] Cannot compute inverse tangent for arrays"
" with non floating point type.");
}
}
} // namespace mlx::core

View File

@@ -1,6 +1,8 @@
// Copyright © 2023 Apple Inc.
#pragma once
+#include <cassert>
#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/utils.h"

View File

@@ -0,0 +1,101 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h"
#ifdef ACCELERATE_NEW_LAPACK
#include <Accelerate/Accelerate.h>
#else
#include <lapack.h>
#endif
namespace mlx::core {
namespace {
// Delegate to the Cholesky factorization taking into account differences in
// LAPACK implementations (basically how to pass the 'uplo' string to fortran).
int spotrf_wrapper(char uplo, float* matrix, int N) {
int info;
#ifdef LAPACK_FORTRAN_STRLEN_END
spotrf_(
/* uplo = */ &uplo,
/* n = */ &N,
/* a = */ matrix,
/* lda = */ &N,
/* info = */ &info,
/* uplo_len = */ static_cast<size_t>(1));
#else
spotrf_(
/* uplo = */ &uplo,
/* n = */ &N,
/* a = */ matrix,
/* lda = */ &N,
/* info = */ &info);
#endif
return info;
}
} // namespace
void cholesky_impl(const array& a, array& factor, bool upper) {
// Lapack uses the column-major convention. We take advantage of the fact that
// the matrix should be symmetric:
// (A)ᵀ = A
// and that a column-major lower triangular matrix is a row-major upper
// triangular matrix, so uplo is the opposite of what we would expect from
// upper
char uplo = (upper) ? 'L' : 'U';
// The decomposition is computed in place, so just copy the input to the
// output.
copy(
a,
factor,
a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
const int N = a.shape(-1);
const size_t num_matrices = a.size() / (N * N);
float* matrix = factor.data<float>();
for (int i = 0; i < num_matrices; i++) {
// Compute Cholesky factorization.
int info = spotrf_wrapper(uplo, matrix, N);
// TODO: We do nothing when the matrix is not positive semi-definite
// because throwing an error would result in a crash. If we figure out how
// to catch errors from the implementation we should throw.
if (info < 0) {
std::stringstream msg;
msg << "[cholesky] Cholesky decomposition failed with error code "
<< info;
throw std::runtime_error(msg.str());
}
// Zero out the upper/lower triangle while advancing the pointer to the
// next matrix at the same time.
for (int row = 0; row < N; row++) {
if (upper) {
std::fill(matrix, matrix + row, 0);
} else {
std::fill(matrix + row + 1, matrix + N, 0);
}
matrix += N;
}
}
}
void Cholesky::eval(const std::vector<array>& inputs, array& output) {
if (inputs[0].dtype() != float32) {
throw std::runtime_error("[Cholesky::eval] only supports float32.");
}
cholesky_impl(inputs[0], output, upper_);
}
} // namespace mlx::core
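A worked example of the uplo flip in cholesky_impl above: a row-major buffer handed to column-major LAPACK is read as its transpose, and since A = Aᵀ for the symmetric input, asking for the *upper* factor writes Uᵀ = L into memory in exactly the order a row-major reader expects for the lower factor. A small numeric check (illustrative only, done by hand to avoid a LAPACK link):

#include <cmath>
#include <cstdio>

int main() {
  // A = [[4, 2], [2, 3]], lower Cholesky: l00 = sqrt(a00),
  // l10 = a10 / l00, l11 = sqrt(a11 - l10^2)
  double a00 = 4, a10 = 2, a11 = 3;
  double l00 = std::sqrt(a00);
  double l10 = a10 / l00;
  double l11 = std::sqrt(a11 - l10 * l10);
  std::printf("L = [[%g, 0], [%g, %g]]\n", l00, l10, l11); // [[2,0],[1,1.41421]]
  // Column-major storage of U = [[2, 1], [0, sqrt(2)]] is {2, 0, 1, sqrt(2)},
  // which read row-major is [[2, 0], [1, sqrt(2)]] = L: no transpose copy needed.
}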

View File

@@ -0,0 +1,304 @@
// Copyright © 2024 Apple Inc.
#include <cassert>
#include "mlx/backend/common/utils.h"
#include "mlx/primitives.h"
namespace mlx::core {
void AsStrided::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (!in.flags().row_contiguous) {
// Just ensuring that inputs[0] came from the ops which would ensure the
// input is row contiguous.
throw std::runtime_error(
"AsStrided must be used with row contiguous arrays only.");
}
// Compute the flags given the shape and strides
bool row_contiguous = true, col_contiguous = true;
size_t r = 1, c = 1;
for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
r *= shape_[i];
c *= shape_[j];
}
auto flags = in.flags();
// TODO: Compute the contiguous flag in a better way cause now we are
// unnecessarily strict.
flags.contiguous = row_contiguous || col_contiguous;
flags.row_contiguous = row_contiguous;
flags.col_contiguous = col_contiguous;
// There is no easy way to compute the actual data size so we use out.size().
// The contiguous flag will almost certainly not be set so no code should
// rely on data_size anyway.
size_t data_size = out.size();
return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
}
void Broadcast::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.size() == 0) {
out.set_data(nullptr);
return;
}
std::vector<size_t> strides(out.ndim(), 0);
int diff = out.ndim() - in.ndim();
for (int i = in.ndim() - 1; i >= 0; --i) {
strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
}
auto flags = in.flags();
if (out.size() > in.size()) {
flags.row_contiguous = flags.col_contiguous = false;
}
out.copy_shared_buffer(in, strides, flags, in.data_size());
}
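Broadcast::eval above realizes broadcasting without copying any data: new leading axes and size-1 axes get stride 0, so every output index maps back to the same source element. A standalone sketch of that stride computation (illustrative names, not part of this changeset):

#include <cstdio>
#include <vector>

std::vector<size_t> broadcast_strides(
    const std::vector<int>& in_shape, const std::vector<size_t>& in_strides,
    int out_ndim) {
  std::vector<size_t> strides(out_ndim, 0); // new leading axes: stride 0
  int diff = out_ndim - static_cast<int>(in_shape.size());
  for (int i = static_cast<int>(in_shape.size()) - 1; i >= 0; --i) {
    strides[i + diff] = (in_shape[i] == 1) ? 0 : in_strides[i];
  }
  return strides;
}

int main() {
  // Broadcasting shape (3, 1) with strides (1, 1) into a 3-d output:
  auto s = broadcast_strides({3, 1}, {1, 1}, 3);
  std::printf("%zu %zu %zu\n", s[0], s[1], s[2]); // 0 1 0
}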
void Copy::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.copy_shared_buffer(inputs[0]);
}
void CustomTransforms::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
i++, j++) {
outputs[i].copy_shared_buffer(inputs[j]);
}
}
void Depends::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0; i < outputs.size(); i++) {
outputs[i].copy_shared_buffer(inputs[i]);
}
}
void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
double numel = 1;
for (auto ax : axes_) {
numel *= inputs[0].shape(ax);
}
if (inverted_) {
numel = 1.0 / numel;
}
switch (out.dtype()) {
case bool_:
*out.data<bool>() = static_cast<bool>(numel);
break;
case uint8:
*out.data<uint8_t>() = static_cast<uint8_t>(numel);
break;
case uint16:
*out.data<uint16_t>() = static_cast<uint16_t>(numel);
break;
case uint32:
*out.data<uint32_t>() = static_cast<uint32_t>(numel);
break;
case uint64:
*out.data<uint64_t>() = static_cast<uint64_t>(numel);
break;
case int8:
*out.data<int8_t>() = static_cast<int8_t>(numel);
break;
case int16:
*out.data<int16_t>() = static_cast<int16_t>(numel);
break;
case int32:
*out.data<int32_t>() = static_cast<int32_t>(numel);
break;
case int64:
*out.data<int64_t>() = static_cast<int64_t>(numel);
break;
case float16:
*out.data<float16_t>() = static_cast<float16_t>(numel);
break;
case float32:
*out.data<float>() = static_cast<float>(numel);
break;
case bfloat16:
*out.data<bfloat16_t>() = static_cast<bfloat16_t>(numel);
break;
case complex64:
*out.data<complex64_t>() = static_cast<complex64_t>(numel);
break;
}
}
std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
const array& in,
const array& out) {
// Special case for empty arrays or row contiguous arrays
if (in.size() == 0 || in.flags().row_contiguous) {
return {false, out.strides()};
}
// Special case for scalars
if (in.ndim() == 0) {
std::vector<size_t> out_strides(out.ndim(), 0);
return {false, out_strides};
}
// Firstly let's collapse all the contiguous dimensions of the input
auto [shape, _strides] = collapse_contiguous_dims(in);
auto& strides = _strides[0];
// If shapes fit exactly in the contiguous dims then no copy is necessary so
// let's check.
std::vector<size_t> out_strides;
bool copy_necessary = false;
int j = 0;
for (int i = 0; i < out.ndim(); i++) {
int N = out.shape(i);
if (j < shape.size() && shape[j] % N == 0) {
shape[j] /= N;
out_strides.push_back(shape[j] * strides[j]);
j += (shape[j] == 1);
} else if (N == 1) {
// i > 0 because otherwise j < shape.size() && shape[j] % 1 == 0
out_strides.push_back(out_strides.back());
} else {
copy_necessary = true;
break;
}
}
return {copy_necessary, out_strides};
}
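The decision logic in prepare_reshape above reduces to: after collapsing contiguous dimensions, each output dimension must carve evenly out of the current collapsed dimension, otherwise a materializing copy is required. A simplified standalone sketch of that test (hypothetical helper, input already collapsed):

#include <cstdio>
#include <vector>

bool reshape_needs_copy(std::vector<int> collapsed, const std::vector<int>& out_shape) {
  size_t j = 0;
  for (int N : out_shape) {
    if (j < collapsed.size() && collapsed[j] % N == 0) {
      collapsed[j] /= N; // peel N off the current collapsed dim
      if (collapsed[j] == 1) j++;
    } else if (N != 1) {
      return true; // output dim straddles a non-contiguous boundary
    }
  }
  return false;
}

int main() {
  // A transposed (non-contiguous) 4 x 6 collapses to {4, 6}, not {24}, so
  // reshaping to (3, 8) straddles the boundary and needs a copy:
  std::printf("%d\n", reshape_needs_copy({4, 6}, {3, 8})); // 1
  // Reshaping to (2, 2, 6) nests cleanly inside {4, 6}: no copy.
  std::printf("%d\n", reshape_needs_copy({4, 6}, {2, 2, 6})); // 0
}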
void Reshape::shared_buffer_reshape(
const array& in,
const std::vector<size_t>& out_strides,
array& out) {
auto flags = in.flags();
if (flags.row_contiguous) {
// For row contiguous reshapes:
// - Shallow copy the buffer
// - If reshaping into a vector (all singleton dimensions except one) it
// becomes col contiguous again.
auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
}
out.copy_shared_buffer(in, out_strides, flags, in.data_size());
}
void Split::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 1);
auto& in = inputs[0];
auto compute_new_flags = [](const auto& shape,
const auto& strides,
size_t in_data_size,
auto flags) {
size_t data_size = 1;
size_t f_stride = 1;
size_t b_stride = 1;
flags.row_contiguous = true;
flags.col_contiguous = true;
for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
flags.col_contiguous &= strides[i] == f_stride || shape[i] == 1;
flags.row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
f_stride *= shape[i];
b_stride *= shape[ri];
if (strides[i] > 0) {
data_size *= shape[i];
}
}
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in_data_size) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
return std::pair<decltype(flags), size_t>{flags, data_size};
};
std::vector<int> indices(1, 0);
indices.insert(indices.end(), indices_.begin(), indices_.end());
for (int i = 0; i < indices.size(); i++) {
size_t offset = indices[i] * in.strides()[axis_];
auto [new_flags, data_size] = compute_new_flags(
outputs[i].shape(), in.strides(), in.data_size(), in.flags());
outputs[i].copy_shared_buffer(
in, in.strides(), new_flags, data_size, offset);
}
}
std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
const array& in) {
int64_t data_offset = 0;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices_[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides_[i];
}
return std::make_tuple(data_offset, inp_strides);
}
void StopGradient::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.copy_shared_buffer(inputs[0]);
}
void Transpose::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
std::vector<size_t> out_strides(out.ndim());
auto& in = inputs[0];
for (int ax = 0; ax < axes_.size(); ++ax) {
out_strides[ax] = in.strides()[axes_[ax]];
}
// Conditions for {row/col}_contiguous
// - array must be contiguous (no gaps)
// - underlying buffer size should have the same size as the array
// - cumulative product of shapes is equal to the strides (we can ignore axes
// with size == 1)
// - in the forward direction (column contiguous)
// - in the reverse direction (row contiguous)
// - vectors are both row and col contiguous (hence if both row/col are
// true, they stay true)
auto flags = in.flags();
if (flags.contiguous && in.data_size() == in.size()) {
size_t f_stride = 1;
size_t b_stride = 1;
flags.col_contiguous = true;
flags.row_contiguous = true;
for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
flags.col_contiguous &= (out_strides[i] == f_stride || out.shape(i) == 1);
f_stride *= out.shape(i);
flags.row_contiguous &=
(out_strides[ri] == b_stride || out.shape(ri) == 1);
b_stride *= out.shape(ri);
}
}
out.copy_shared_buffer(in, out_strides, flags, in.data_size());
}
} // namespace mlx::core

View File

@@ -205,8 +205,8 @@ void compiled_allocate_outputs(
  // - Donatable
  // - Correct size
  // - Not a constant
-  if (in.flags().row_contiguous && in.nbytes() == outputs[o].nbytes() &&
-      in.is_donatable() &&
+  if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
+      in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
      constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
    if (move_buffers) {
      outputs[o].move_shared_buffer(
View File

@@ -38,11 +38,15 @@ void slow_conv_1D(
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const int iH = 1 + in_dilation[0] * (in.shape(1) - 1); // Input spatial dim
+  const int C = in.shape(2); // Input channels
  const int oH = out.shape(1); // Output spatial dim
  const int O = wt.shape(0); // Out channels
-  const int C = wt.shape(2); // In channels
  const int wH = wt.shape(1); // Weight spatial dim
+  const int groups = C / wt.shape(2);
+  const int C_per_group = wt.shape(2);
+  const int O_per_group = O / groups;
  const size_t in_stride_N = in.strides()[0];
  const size_t in_stride_H = in.strides()[1];
  const size_t in_stride_C = in.strides()[2];
@@ -57,7 +61,8 @@ void slow_conv_1D(
  for (int n = 0; n < N; ++n) {
    for (int oh = 0; oh < oH; ++oh) {
-      for (int o = 0; o < O; ++o) {
+      for (int g = 0; g < groups; ++g) {
+        for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
          const T* filter_wt_ptr = start_wt_ptr + o * wt_stride_O;
          float r = 0.;
@@ -70,10 +75,10 @@ void slow_conv_1D(
            auto ih_div = std::div(ih, in_dilation[0]);
            if (ih >= 0 && ih < iH && ih_div.rem == 0) {
-              for (int c = 0; c < C; ++c) {
+              for (int c = g * C_per_group; c < (g + 1) * C_per_group; ++c) {
                r += static_cast<float>(
                         in_ptr[ih_div.quot * in_stride_H + c * in_stride_C]) *
-                    static_cast<float>(wt_ptr[c * wt_stride_C]);
+                    static_cast<float>(wt_ptr[(c % C_per_group) * wt_stride_C]);
              } // c
            } // ih check
@@ -81,11 +86,11 @@ void slow_conv_1D(
          out_ptr[oh * out_stride_H + o * out_stride_O] = static_cast<T>(r);
        } // o
+      } // g
    } // oh
    in_ptr += in_stride_N;
    out_ptr += out_stride_N;
  } // n
}
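The grouped loops above encode a simple index map: output channel o belongs to group g = o / O_per_group, reads only that group's input channels [g*C_per_group, (g+1)*C_per_group), and the weight tensor's channel axis is group-local, hence the (c % C_per_group) when indexing weights. A tiny standalone print of that map (illustrative only):

#include <cstdio>

int main() {
  int C = 4, O = 4, groups = 2;
  int C_per_group = C / groups, O_per_group = O / groups;
  for (int g = 0; g < groups; ++g)
    for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o)
      for (int c = g * C_per_group; c < (g + 1) * C_per_group; ++c)
        std::printf("out %d <- in %d (wt channel %d)\n", o, c, c % C_per_group);
}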
@@ -106,13 +111,17 @@ void slow_conv_2D(
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const int iH = 1 + in_dilation[0] * (in.shape(1) - 1); // Input spatial dim
  const int iW = 1 + in_dilation[1] * (in.shape(2) - 1); // Input spatial dim
+  const int C = in.shape(3); // In channels
  const int oH = out.shape(1); // Output spatial dim
  const int oW = out.shape(2); // Output spatial dim
  const int O = wt.shape(0); // Out channels
-  const int C = wt.shape(3); // In channels
  const int wH = wt.shape(1); // Weight spatial dim
  const int wW = wt.shape(2); // Weight spatial dim
+  const int groups = C / wt.shape(3);
+  const int C_per_group = wt.shape(3);
+  const int O_per_group = O / groups;
  const size_t in_stride_N = in.strides()[0];
  const size_t in_stride_H = in.strides()[1];
  const size_t in_stride_W = in.strides()[2];
@@ -136,7 +145,8 @@ void slow_conv_2D(
    int ih_base = oh * wt_strides[0] - padding[0];
    int iw_base = ow * wt_strides[1] - padding[1];
-    for (int o = 0; o < O; ++o) {
+    for (int g = 0; g < groups; ++g) {
+      for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
        float r = 0.;
        for (int wh = 0; wh < wH; ++wh) {
@@ -146,16 +156,16 @@ void slow_conv_2D(
            int ih = ih_base + wh_flip * wt_dilation[0];
            int iw = iw_base + ww_flip * wt_dilation[1];
-            const T* wt_ptr_pt = wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
-            const T* in_ptr_pt = in_ptr + ih * in_stride_H + iw * in_stride_W;
+            const T* wt_ptr_pt =
+                wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
+            const T* in_ptr_pt =
+                in_ptr + ih * in_stride_H + iw * in_stride_W;
-            for (int c = 0; c < C; ++c) {
+            for (int c = g * C_per_group; c < (g + 1) * C_per_group; ++c) {
-              r += static_cast<float>(in_ptr_pt[0]) *
-                  static_cast<float>(wt_ptr_pt[0]);
-              in_ptr_pt += in_stride_C;
-              wt_ptr_pt += wt_stride_C;
+              r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
+                  static_cast<float>(
+                      wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
            } // c
          } // ww
        } // wh
@@ -163,6 +173,7 @@ void slow_conv_2D(
        out_ptr += out_stride_O;
        wt_ptr += wt_stride_O;
      } // o
+    } // g
  };

  int jump_h = flip ? -wt_dilation[0] : wt_dilation[0];
@@ -214,7 +225,8 @@ void slow_conv_2D(
    int wh_base = base_h[oh % f_out_jump_h];
    int ww_base = base_w[ow % f_out_jump_w];
-    for (int o = 0; o < O; ++o) {
+    for (int g = 0; g < groups; ++g) {
+      for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
        float r = 0.;
        for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
@@ -234,11 +246,11 @@ void slow_conv_2D(
                const T* in_ptr_pt =
                    in_ptr + ih_dil * in_stride_H + iw_dil * in_stride_W;
-                for (int c = 0; c < C; ++c) {
-                  r += static_cast<float>(in_ptr_pt[0]) *
-                      static_cast<float>(wt_ptr_pt[0]);
-                  in_ptr_pt += in_stride_C;
-                  wt_ptr_pt += wt_stride_C;
+                for (int c = g * C_per_group; c < (g + 1) * C_per_group;
+                     ++c) {
+                  r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
+                      static_cast<float>(
+                          wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
                } // c
              } // ih, iw check
@@ -249,6 +261,7 @@ void slow_conv_2D(
        out_ptr += out_stride_O;
        wt_ptr += wt_stride_O;
      } // o
+    } // g
  };

  int oH_border_0 = 0;
@@ -305,6 +318,296 @@ void slow_conv_2D(
  } // n
}
template <typename T>
void slow_conv_3D(
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation,
const std::vector<int>& in_dilation,
bool flip) {
const T* st_wt_ptr = wt.data<T>();
const T* st_in_ptr = in.data<T>();
T* st_out_ptr = out.data<T>();
const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
const int iD = 1 + in_dilation[0] * (in.shape(1) - 1); // Input spatial dim
const int iH = 1 + in_dilation[1] * (in.shape(2) - 1); // Input spatial dim
const int iW = 1 + in_dilation[2] * (in.shape(3) - 1); // Input spatial dim
const int oD = out.shape(1); // Output spatial dim
const int oH = out.shape(2); // Output spatial dim
const int oW = out.shape(3); // Output spatial dim
const int O = wt.shape(0); // Out channels
const int C = wt.shape(4); // In channels
const int wD = wt.shape(1); // Weight spatial dim
const int wH = wt.shape(2); // Weight spatial dim
const int wW = wt.shape(3); // Weight spatial dim
const size_t in_stride_N = in.strides()[0];
const size_t in_stride_D = in.strides()[1];
const size_t in_stride_H = in.strides()[2];
const size_t in_stride_W = in.strides()[3];
const size_t in_stride_C = in.strides()[4];
const size_t wt_stride_O = wt.strides()[0];
const size_t wt_stride_D = wt.strides()[1];
const size_t wt_stride_H = wt.strides()[2];
const size_t wt_stride_W = wt.strides()[3];
const size_t wt_stride_C = wt.strides()[4];
const size_t out_stride_N = out.strides()[0];
const size_t out_stride_D = out.strides()[1];
const size_t out_stride_H = out.strides()[2];
const size_t out_stride_W = out.strides()[3];
const size_t out_stride_O = out.strides()[4];
bool is_idil_one =
in_dilation[0] == 1 && in_dilation[1] == 1 && in_dilation[2] == 1;
auto pt_conv_no_checks = [&](const T* in_ptr,
const T* wt_ptr,
T* out_ptr,
int od,
int oh,
int ow) {
out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;
int id_base = od * wt_strides[0] - padding[0];
int ih_base = oh * wt_strides[1] - padding[1];
int iw_base = ow * wt_strides[2] - padding[2];
for (int o = 0; o < O; ++o) {
float r = 0.;
for (int wd = 0; wd < wD; ++wd) {
for (int wh = 0; wh < wH; ++wh) {
for (int ww = 0; ww < wW; ++ww) {
int wd_flip = flip ? wD - wd - 1 : wd;
int wh_flip = flip ? wH - wh - 1 : wh;
int ww_flip = flip ? wW - ww - 1 : ww;
int id = id_base + wd_flip * wt_dilation[0];
int ih = ih_base + wh_flip * wt_dilation[1];
int iw = iw_base + ww_flip * wt_dilation[2];
const T* wt_ptr_pt =
wt_ptr + wd * wt_stride_D + wh * wt_stride_H + ww * wt_stride_W;
const T* in_ptr_pt =
in_ptr + id * in_stride_D + ih * in_stride_H + iw * in_stride_W;
for (int c = 0; c < C; ++c) {
r += static_cast<float>(in_ptr_pt[0]) *
static_cast<float>(wt_ptr_pt[0]);
in_ptr_pt += in_stride_C;
wt_ptr_pt += wt_stride_C;
} // c
} // ww
} // wh
} // wd
out_ptr[0] = static_cast<T>(r);
out_ptr += out_stride_O;
wt_ptr += wt_stride_O;
} // o
};
int jump_d = flip ? -wt_dilation[0] : wt_dilation[0];
int jump_h = flip ? -wt_dilation[1] : wt_dilation[1];
int jump_w = flip ? -wt_dilation[2] : wt_dilation[2];
int init_d = (flip ? (wD - 1) * wt_dilation[0] : 0);
int init_h = (flip ? (wH - 1) * wt_dilation[1] : 0);
int init_w = (flip ? (wW - 1) * wt_dilation[2] : 0);
int f_wgt_jump_d = std::lcm(in_dilation[0], wt_dilation[0]) / wt_dilation[0];
int f_wgt_jump_h = std::lcm(in_dilation[1], wt_dilation[1]) / wt_dilation[1];
int f_wgt_jump_w = std::lcm(in_dilation[2], wt_dilation[2]) / wt_dilation[2];
int f_out_jump_d = std::lcm(in_dilation[0], wt_strides[0]) / wt_strides[0];
int f_out_jump_h = std::lcm(in_dilation[1], wt_strides[1]) / wt_strides[1];
int f_out_jump_w = std::lcm(in_dilation[2], wt_strides[2]) / wt_strides[2];
std::vector<int> base_d(f_out_jump_d);
std::vector<int> base_h(f_out_jump_h);
std::vector<int> base_w(f_out_jump_w);
for (int i = 0; i < f_out_jump_d; ++i) {
int id_loop = i * wt_strides[0] - padding[0] + init_d;
int wd_base = 0;
while (wd_base < wD && id_loop % in_dilation[0] != 0) {
wd_base++;
id_loop += jump_d;
}
base_d[i] = wd_base;
}
for (int i = 0; i < f_out_jump_h; ++i) {
int ih_loop = i * wt_strides[1] - padding[1] + init_h;
int wh_base = 0;
while (wh_base < wH && ih_loop % in_dilation[1] != 0) {
wh_base++;
ih_loop += jump_h;
}
base_h[i] = wh_base;
}
for (int j = 0; j < f_out_jump_w; ++j) {
int iw_loop = j * wt_strides[2] - padding[2] + init_w;
int ww_base = 0;
while (ww_base < wW && iw_loop % in_dilation[2] != 0) {
ww_base++;
iw_loop += jump_w;
}
base_w[j] = ww_base;
}
auto pt_conv_all_checks = [&](const T* in_ptr,
const T* wt_ptr,
T* out_ptr,
int od,
int oh,
int ow) {
out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;
int id_base = od * wt_strides[0] - padding[0];
int ih_base = oh * wt_strides[1] - padding[1];
int iw_base = ow * wt_strides[2] - padding[2];
int wd_base = base_d[od % f_out_jump_d];
int wh_base = base_h[oh % f_out_jump_h];
int ww_base = base_w[ow % f_out_jump_w];
for (int o = 0; o < O; ++o) {
float r = 0.;
for (int wd = wd_base; wd < wD; wd += f_wgt_jump_d) {
for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
int wd_flip = flip ? wD - wd - 1 : wd;
int wh_flip = flip ? wH - wh - 1 : wh;
int ww_flip = flip ? wW - ww - 1 : ww;
int id = id_base + wd_flip * wt_dilation[0];
int ih = ih_base + wh_flip * wt_dilation[1];
int iw = iw_base + ww_flip * wt_dilation[2];
if (id >= 0 && id < iD && ih >= 0 && ih < iH && iw >= 0 &&
iw < iW) {
const T* wt_ptr_pt = wt_ptr + wd * wt_stride_D +
wh * wt_stride_H + ww * wt_stride_W;
int id_dil = !is_idil_one ? (id / in_dilation[0]) : id;
int ih_dil = !is_idil_one ? (ih / in_dilation[1]) : ih;
int iw_dil = !is_idil_one ? (iw / in_dilation[2]) : iw;
const T* in_ptr_pt = in_ptr + id_dil * in_stride_D +
ih_dil * in_stride_H + iw_dil * in_stride_W;
for (int c = 0; c < C; ++c) {
r += static_cast<float>(in_ptr_pt[0]) *
static_cast<float>(wt_ptr_pt[0]);
in_ptr_pt += in_stride_C;
wt_ptr_pt += wt_stride_C;
} // c
} // iD, ih, iw check
} // ww
} // wh
} // wd
out_ptr[0] = static_cast<T>(r);
out_ptr += out_stride_O;
wt_ptr += wt_stride_O;
} // o
};
int oD_border_0 = 0;
int oD_border_1 =
is_idil_one ? ((padding[0] + wt_strides[0] - 1) / wt_strides[0]) : oD;
int oD_border_2 = std::max(
oD_border_1, (iD + padding[0] - wD * wt_dilation[0]) / wt_strides[0]);
int oD_border_3 = oD;
int oH_border_0 = 0;
int oH_border_1 =
is_idil_one ? ((padding[1] + wt_strides[1] - 1) / wt_strides[1]) : oH;
int oH_border_2 = std::max(
oH_border_1, (iH + padding[1] - wH * wt_dilation[1]) / wt_strides[1]);
int oH_border_3 = oH;
int oW_border_0 = 0;
int oW_border_1 =
is_idil_one ? ((padding[2] + wt_strides[2] - 1) / wt_strides[2]) : oW;
int oW_border_2 = std::max(
oW_border_1, (iW + padding[2] - wW * wt_dilation[2]) / wt_strides[2]);
int oW_border_3 = oW;
for (int n = 0; n < N; ++n) {
// Case 1: od might put us out of bounds
for (int od = oD_border_0; od < oD_border_1; ++od) {
for (int oh = 0; oh < oH; ++oh) {
for (int ow = 0; ow < oW; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
} // od
// Case 2: od in bounds
for (int od = oD_border_1; od < oD_border_2; ++od) {
// Case 2.1: oh might put us out of bounds
for (int oh = oH_border_0; oh < oH_border_1; ++oh) {
for (int ow = 0; ow < oW; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
// Case 2.2: oh in bounds
for (int oh = oH_border_1; oh < oH_border_2; ++oh) {
// Case 2.2.1: ow might put us out of bounds
for (int ow = oW_border_0; ow < oW_border_1; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
// Case 2.2.2: ow in bounds
for (int ow = oW_border_1; ow < oW_border_2; ++ow) {
pt_conv_no_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
// Case 2.2.3: ow might put us out of bounds
for (int ow = oW_border_2; ow < oW_border_3; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
// Case 2.3: oh might put us out of bounds
for (int oh = oH_border_2; oh < oH_border_3; ++oh) {
for (int ow = 0; ow < oW; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
} // od
// Case 3: od might put us out of bounds
for (int od = oD_border_2; od < oD_border_3; ++od) {
for (int oh = 0; oh < oH; ++oh) {
for (int ow = 0; ow < oW; ++ow) {
pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
} // ow
} // oh
} // od
st_in_ptr += in_stride_N;
st_out_ptr += out_stride_N;
} // n
}
void dispatch_slow_conv_1D(
    const array& in,
    const array& wt,
@@ -353,6 +656,30 @@ void dispatch_slow_conv_2D(
  }
}
void dispatch_slow_conv_3D(
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation,
const std::vector<int>& in_dilation,
bool flip) {
if (in.dtype() == float32) {
return slow_conv_3D<float>(
in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
} else if (in.dtype() == float16) {
return slow_conv_3D<float16_t>(
in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
} else if (in.dtype() == bfloat16) {
return slow_conv_3D<bfloat16_t>(
in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
} else {
throw std::invalid_argument(
"[Convolution::eval] got unsupported data type.");
}
}
///////////////////////////////////////////////////////////////////////////////
// Explicit gemm conv
///////////////////////////////////////////////////////////////////////////////
@@ -366,11 +693,15 @@ void explicit_gemm_conv_1D_cpu(
    const std::vector<int>& wt_dilation) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const int iH = in.shape(1); // Input spatial dim
+  const int C = in.shape(2); // Input channels
  const int oH = out.shape(1); // Output spatial dim
  const int O = wt.shape(0); // Out channels
-  const int C = wt.shape(2); // In channels
  const int wH = wt.shape(1); // Weight spatial dim
+  const int groups = C / wt.shape(2);
+  const int C_per_group = wt.shape(2);
+  const int O_per_group = O / groups;

  auto conv_dtype = float32;

  // Pad input
@@ -402,6 +733,11 @@ void explicit_gemm_conv_1D_cpu(
      in_padded.strides()[1],
      in_padded.strides()[2]};
  auto flags = in_padded.flags();
+  if (groups > 1) {
+    // Transpose the last two dimensions for grouped convolutions
+    std::swap(strided_shape[2], strided_shape[3]);
+    std::swap(strided_strides[2], strided_strides[3]);
+  }
  array in_strided_view(strided_shape, in_padded.dtype(), nullptr, {});
  in_strided_view.copy_shared_buffer(
@@ -416,7 +752,19 @@ void explicit_gemm_conv_1D_cpu(
  auto gemm_wt = wt;
  auto gemm_out = out;

-  if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
+  if (groups > 1) {
+    // Transpose the last two dimensions for grouped convolutions
+    array wt_transpose(
+        {wt.shape(0), wt.shape(2), wt.shape(1)}, wt.dtype(), nullptr, {});
+    wt_transpose.copy_shared_buffer(
+        wt,
+        {wt.strides(0), wt.strides(2), wt.strides(1)},
+        wt.flags(),
+        wt.size(),
+        0);
+    gemm_wt = array(wt_transpose.shape(), float32, nullptr, {});
+    copy(wt_transpose, gemm_wt, CopyType::General);
+  } else if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
    auto ctype =
        wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
    gemm_wt = array(wt.shape(), float32, nullptr, {});
@@ -428,21 +776,22 @@ void explicit_gemm_conv_1D_cpu(
    gemm_out.set_data(allocator::malloc_or_wait(gemm_out.nbytes()));
  }

+  for (int g = 0; g < groups; ++g) {
    // Perform gemm
    cblas_sgemm(
        CblasRowMajor,
        CblasNoTrans, // no trans A
        CblasTrans, // transB
        strided_reshape[0], // M
-        O, // N
-        strided_reshape[1], // K
+        O_per_group, // N
+        C_per_group * wH, // K
        1.0f, // alpha
-        in_strided.data<float>(),
-        strided_reshape[1], // lda
-        gemm_wt.data<float>(),
-        strided_reshape[1], // ldb
+        in_strided.data<float>() + g * C_per_group * wH, // A
+        wH * C, // lda
+        gemm_wt.data<float>() + g * O_per_group * C_per_group * wH, // B
+        wH * C_per_group, // ldb
        0.0f, // beta
-        gemm_out.data<float>(),
+        gemm_out.data<float>() + g * O_per_group, // C
        O // ldc
    );
@@ -451,6 +800,7 @@ void explicit_gemm_conv_1D_cpu(
      copy(gemm_out, out, CopyType::Vector);
    }
  }
+}
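The per-group GEMM above works purely through pointer offsets and leading dimensions: each group multiplies an M x (C_per_group*wH) column slice of the unfolded input (offset into the row, lda equal to the full row width) with its own weight block, writing a column slice of the output. A toy standalone version of that arithmetic, under the same row-major CBLAS conventions (requires a CBLAS; Accelerate provides one on macOS):

#include <Accelerate/Accelerate.h>
#include <cstdio>

int main() {
  const int M = 2, groups = 2, Kg = 2, Ng = 1; // per-group K and N
  const int K = groups * Kg, N = groups * Ng;
  float A[M * K] = {1, 2, 3, 4,   // row 0: group 0 cols, then group 1 cols
                    5, 6, 7, 8};  // row 1
  float B[groups * Ng * Kg] = {1, 1,  // group 0 weights (Ng x Kg, row-major)
                               2, 2}; // group 1 weights
  float C[M * N] = {0};
  for (int g = 0; g < groups; ++g) {
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                M, Ng, Kg, 1.0f,
                A + g * Kg, K,       // column slice of A, lda = full width K
                B + g * Ng * Kg, Kg, // this group's weight block
                0.0f, C + g * Ng, N);// column slice of C, ldc = full width N
  }
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]); // 3 14 / 11 30
}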
void explicit_gemm_conv_2D_cpu(
    const array& in,
@@ -554,6 +904,131 @@ void explicit_gemm_conv_2D_cpu(
  }
}
void explicit_gemm_conv_ND_cpu(
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation) {
const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
const auto iDim = std::vector<int>(
in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
const auto oDim = std::vector<int>(
out.shape().begin() + 1, out.shape().end() - 1); // Output spatial dim
const int O = wt.shape(0); // Out channels
const int C = wt.shape(-1); // In channels
const auto wDim = std::vector<int>(
wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim
auto conv_dtype = float32;
// Pad input
std::vector<int> padded_shape(in.shape().size());
padded_shape.front() = N;
for (size_t i = 0; i < iDim.size(); i++) {
padded_shape[i + 1] = iDim[i] + 2 * padding[i];
}
padded_shape.back() = C;
array in_padded(padded_shape, conv_dtype, nullptr, {});
// Fill with zeros
copy(array(0, conv_dtype), in_padded, CopyType::Scalar);
// Pick input slice from padded
size_t data_offset = 0;
for (size_t i = 0; i < padding.size(); i++) {
data_offset += padding[i] * in_padded.strides()[i + 1];
}
array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
in_padded_slice.copy_shared_buffer(
in_padded,
in_padded.strides(),
in_padded.flags(),
in_padded_slice.size(),
data_offset);
// Copy input values into the slice
copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);
// Make strided view
std::vector<int> strided_shape(oDim.size() + wDim.size() + 2);
strided_shape.front() = N;
for (size_t i = 0; i < oDim.size(); i++) {
strided_shape[i + 1] = oDim[i];
}
for (size_t i = 0; i < wDim.size(); i++) {
strided_shape[i + 1 + oDim.size()] = wDim[i];
}
strided_shape.back() = C;
std::vector<size_t> strided_strides(in.shape().size() * 2 - 2);
strided_strides[0] = in_padded.strides()[0];
for (size_t i = 0; i < wt_strides.size(); i++) {
strided_strides[i + 1] = in_padded.strides()[i + 1] * wt_strides[i];
}
for (size_t i = 1; i < in_padded.strides().size(); i++) {
strided_strides[i + wt_strides.size()] = in_padded.strides()[i];
}
auto flags = in_padded.flags();
array in_strided_view(strided_shape, in_padded.dtype(), nullptr, {});
in_strided_view.copy_shared_buffer(
in_padded, strided_strides, flags, in_strided_view.size(), 0);
// Materialize strided view
std::vector<int> strided_reshape = {N, C};
for (const auto& o : oDim) {
strided_reshape[0] *= o;
}
for (const auto& w : wDim) {
strided_reshape[1] *= w;
}
array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
copy(in_strided_view, in_strided, CopyType::General);
// Check wt dtype and prepare
auto gemm_wt = wt;
auto gemm_out = out;
if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
auto ctype =
wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
gemm_wt = array(wt.shape(), float32, nullptr, {});
copy(wt, gemm_wt, ctype);
}
if (out.dtype() != float32) {
gemm_out = array(out.shape(), float32, nullptr, {});
gemm_out.set_data(allocator::malloc_or_wait(gemm_out.nbytes()));
}
// Perform gemm
cblas_sgemm(
CblasRowMajor,
CblasNoTrans, // no trans A
CblasTrans, // transB
strided_reshape[0], // M
O, // N
strided_reshape[1], // K
1.0f, // alpha
in_strided.data<float>(),
strided_reshape[1], // lda
gemm_wt.data<float>(),
strided_reshape[1], // ldb
0.0f, // beta
gemm_out.data<float>(),
O // ldc
);
// Copy results if needed
if (out.dtype() != float32) {
copy(gemm_out, out, CopyType::Vector);
}
}
///////////////////////////////////////////////////////////////////////////////
// Conv routing
///////////////////////////////////////////////////////////////////////////////
@@ -589,6 +1064,19 @@ void conv_2D_cpu(
      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
}
void conv_3D_cpu(
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation,
const std::vector<int>& in_dilation,
bool flip) {
return dispatch_slow_conv_3D(
in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
}
} // namespace

void Convolution::eval(const std::vector<array>& inputs, array& out) {
@@ -597,8 +1085,20 @@ void Convolution::eval(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];
  auto& wt = inputs[1];
// 3D convolution
if (in.ndim() == (3 + 2)) {
return conv_3D_cpu(
in,
wt,
out,
padding_,
kernel_strides_,
kernel_dilation_,
input_dilation_,
flip_);
}
  // 2D convolution
-  if (in.ndim() == (2 + 2)) {
+  else if (in.ndim() == (2 + 2)) {
    return conv_2D_cpu(
        in,
        wt,

View File

@@ -4,6 +4,7 @@
#include "mlx/allocator.h" #include "mlx/allocator.h"
#include "mlx/backend/common/copy.h" #include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
namespace mlx::core { namespace mlx::core {
@@ -142,29 +143,31 @@ void copy_general(
    const std::vector<int>& data_shape,
    const std::vector<stride_t>& i_strides,
    int64_t i_offset) {
-  switch (src.ndim()) {
+  auto [new_shape, new_strides] = collapse_contiguous_dims(
+      data_shape, std::vector<std::vector<stride_t>>{i_strides});
+  switch (new_shape.size()) {
    case 1:
      copy_general_dim1<SrcT, DstT, stride_t>(
-          src, dst, data_shape, i_strides, i_offset);
+          src, dst, new_shape, new_strides[0], i_offset);
      return;
    case 2:
      copy_general_dim2<SrcT, DstT, stride_t>(
-          src, dst, data_shape, i_strides, i_offset);
+          src, dst, new_shape, new_strides[0], i_offset);
      return;
    case 3:
      copy_general_dim3<SrcT, DstT, stride_t>(
-          src, dst, data_shape, i_strides, i_offset);
+          src, dst, new_shape, new_strides[0], i_offset);
      return;
    case 4:
      copy_general_dim4<SrcT, DstT, stride_t>(
-          src, dst, data_shape, i_strides, i_offset);
+          src, dst, new_shape, new_strides[0], i_offset);
      return;
  }

  auto src_ptr = src.data<SrcT>() + i_offset;
  auto dst_ptr = dst.data<DstT>();
  for (size_t i = 0; i < dst.size(); ++i) {
-    stride_t src_elem = elem_to_loc(i, data_shape, i_strides);
+    stride_t src_elem = elem_to_loc(i, new_shape, new_strides[0]);
    dst_ptr[i] = static_cast<DstT>(src_ptr[src_elem]);
  }
}
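What collapse_contiguous_dims buys the copy above: adjacent axes whose strides nest (stride[i] == stride[i+1] * shape[i+1]) merge into one, so a fully contiguous (2, 3, 4) copy becomes a single dim-1 loop of 24 elements and the specialized dim1-dim4 kernels cover far more cases. A simplified single-stride-set sketch of that merge (illustrative only, not MLX's implementation):

#include <cstdio>
#include <vector>

void collapse(std::vector<int>& shape, std::vector<long>& strides) {
  std::vector<int> s;
  std::vector<long> st;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (!s.empty() && st.back() == strides[i] * shape[i]) {
      s.back() *= shape[i]; // merge into the previous axis
      st.back() = strides[i];
    } else {
      s.push_back(shape[i]);
      st.push_back(strides[i]);
    }
  }
  shape = s;
  strides = st;
}

int main() {
  std::vector<int> shape = {2, 3, 4};
  std::vector<long> strides = {12, 4, 1}; // fully contiguous
  collapse(shape, strides);
  std::printf("ndim=%zu shape[0]=%d stride[0]=%ld\n",
              shape.size(), shape[0], strides[0]); // ndim=1 shape[0]=24 stride[0]=1
}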
@@ -195,10 +198,10 @@ inline void copy_general_general_dims(
    const std::vector<int>& data_shape,
    const std::vector<stride_t>& i_strides,
    const std::vector<stride_t>& o_strides,
-    stride_t i_offset,
-    stride_t o_offset) {
+    int64_t i_offset,
+    int64_t o_offset) {
  if constexpr (D > 1) {
-    int axis = src.ndim() - D;
+    int axis = data_shape.size() - D;
    auto stride_src = i_strides[axis];
    auto stride_dst = o_strides[axis];
    auto N = data_shape[axis];
@@ -209,7 +212,7 @@ inline void copy_general_general_dims(
      o_offset += stride_dst;
    }
  } else {
-    int axis = src.ndim() - 1;
+    int axis = data_shape.size() - 1;
    auto stride_src = i_strides[axis];
    auto stride_dst = o_strides[axis];
    auto N = data_shape[axis];
@@ -230,38 +233,76 @@ void copy_general_general(
     const std::vector<int>& data_shape,
     const std::vector<stride_t>& i_strides,
     const std::vector<stride_t>& o_strides,
-    stride_t i_offset,
-    stride_t o_offset) {
-  switch (src.ndim()) {
+    int64_t i_offset,
+    int64_t o_offset) {
+  auto [new_shape, new_strides] = collapse_contiguous_dims(
+      data_shape, std::vector<std::vector<stride_t>>{i_strides, o_strides});
+  switch (new_shape.size()) {
     case 1:
       copy_general_general_dims<SrcT, DstT, stride_t, 1>(
-          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
+          src,
+          dst,
+          new_shape,
+          new_strides[0],
+          new_strides[1],
+          i_offset,
+          o_offset);
       return;
     case 2:
       copy_general_general_dims<SrcT, DstT, stride_t, 2>(
-          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
+          src,
+          dst,
+          new_shape,
+          new_strides[0],
+          new_strides[1],
+          i_offset,
+          o_offset);
       return;
     case 3:
       copy_general_general_dims<SrcT, DstT, stride_t, 3>(
-          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
+          src,
+          dst,
+          new_shape,
+          new_strides[0],
+          new_strides[1],
+          i_offset,
+          o_offset);
      return;
     case 4:
       copy_general_general_dims<SrcT, DstT, stride_t, 4>(
-          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
+          src,
+          dst,
+          new_shape,
+          new_strides[0],
+          new_strides[1],
+          i_offset,
+          o_offset);
       return;
     case 5:
       copy_general_general_dims<SrcT, DstT, stride_t, 5>(
-          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
+          src,
+          dst,
+          new_shape,
+          new_strides[0],
+          new_strides[1],
+          i_offset,
+          o_offset);
       return;
   }
   int size = std::accumulate(
-      data_shape.begin() - 5, data_shape.end(), 1, std::multiplies<int>());
+      new_shape.end() - 5, new_shape.end(), 1, std::multiplies<int>());
   for (int i = 0; i < src.size(); i += size) {
-    stride_t src_offset = i_offset + elem_to_loc(i, data_shape, i_strides);
-    stride_t dst_offset = o_offset + elem_to_loc(i, dst.shape(), o_strides);
+    stride_t src_offset = i_offset + elem_to_loc(i, new_shape, new_strides[0]);
+    stride_t dst_offset = o_offset + elem_to_loc(i, new_shape, new_strides[1]);
     copy_general_general_dims<SrcT, DstT, stride_t, 5>(
-        src, dst, data_shape, i_strides, o_strides, src_offset, dst_offset);
+        src,
+        dst,
+        new_shape,
+        new_strides[0],
+        new_strides[1],
+        src_offset,
+        dst_offset);
   }
 }
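Both the collapsed and the uncollapsed paths above funnel through an elem_to_loc-style mapping: a flat row-major index is peeled digit by digit into per-axis coordinates, each multiplied by its stride. A self-contained sketch of that mapping, using hypothetical names rather than the MLX declarations:

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical sketch, not the MLX source: map a flat row-major index
// into a strided buffer offset.
int64_t elem_to_loc_sketch(
    int i,
    const std::vector<int>& shape,
    const std::vector<int64_t>& strides) {
  int64_t loc = 0;
  for (int d = shape.size() - 1; d >= 0; --d) {
    loc += int64_t(i % shape[d]) * strides[d]; // coordinate along d times its stride
    i /= shape[d];
  }
  return loc;
}

int main() {
  // A 2x3 array stored transposed (column major): strides {1, 2}.
  std::vector<int> shape = {2, 3};
  std::vector<int64_t> strides = {1, 2};
  for (int i = 0; i < 6; ++i)
    std::printf("%lld ", (long long)elem_to_loc_sketch(i, shape, strides));
  // prints: 0 2 4 1 3 5
}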
@@ -444,8 +485,17 @@ void copy_inplace(
   }
 }
 
-template <>
-void copy_inplace<int64_t>(
+template void copy_inplace<size_t>(
+    const array& src,
+    array& dst,
+    const std::vector<int>& data_shape,
+    const std::vector<size_t>& i_strides,
+    const std::vector<size_t>& o_strides,
+    int64_t i_offset,
+    int64_t o_offset,
+    CopyType ctype);
+
+template void copy_inplace<int64_t>(
     const array& src,
     array& dst,
     const std::vector<int>& data_shape,
@@ -453,24 +503,6 @@ void copy_inplace<int64_t>(
     const std::vector<int64_t>& o_strides,
     int64_t i_offset,
     int64_t o_offset,
-    CopyType ctype) {
-  switch (ctype) {
-    case CopyType::General:
-    case CopyType::GeneralGeneral:
-      return copy_inplace_dispatch(
-          src,
-          dst,
-          ctype,
-          data_shape,
-          i_strides,
-          o_strides,
-          i_offset,
-          o_offset);
-    case CopyType::Scalar:
-    case CopyType::Vector:
-      return copy_inplace_dispatch(src, dst, ctype);
-  }
-}
+    CopyType ctype);
 
 } // namespace mlx::core
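The tail of this hunk swaps a full specialization of copy_inplace for explicit instantiations, so the template definition can stay in the .cpp while callers link against the size_t and int64_t variants. For reference, a generic sketch of the mechanism (a toy sum<T>, not MLX code):

#include <iostream>
#include <vector>

// Hypothetical sketch: the header would only declare
//   template <typename T> T sum(const std::vector<T>&);
// and this .cpp keeps the definition private.
template <typename T>
T sum(const std::vector<T>& v) {
  T acc{};
  for (const auto& x : v)
    acc += x;
  return acc;
}

// Explicit instantiations: emit code for exactly these types in this TU.
template int sum<int>(const std::vector<int>&);
template double sum<double>(const std::vector<double>&);

int main() {
  std::cout << sum<int>({1, 2, 3}) << "\n"; // 6
}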

@@ -5,7 +5,6 @@
 #else
 #include <cblas.h>
 #endif
 #include <cstring>
 
 #include "mlx/array.h"
@@ -34,6 +33,7 @@ DEFAULT(ArcCosh)
 DEFAULT(ArcSin)
 DEFAULT(ArcSinh)
 DEFAULT(ArcTan)
+DEFAULT(ArcTan2)
 DEFAULT(ArcTanh)
 DEFAULT(ArgPartition)
 DEFAULT(ArgReduce)
@@ -41,14 +41,18 @@ DEFAULT(ArgSort)
 DEFAULT(AsType)
 DEFAULT(AsStrided)
 DEFAULT(Broadcast)
+DEFAULT(BlockMaskedMM)
+DEFAULT(GatherMM)
+DEFAULT(GatherQMM)
 DEFAULT_MULTI(DivMod)
 DEFAULT(Ceil)
 DEFAULT(Concatenate)
+DEFAULT(Conjugate)
 DEFAULT(Convolution)
 DEFAULT(Copy)
 DEFAULT(Cos)
 DEFAULT(Cosh)
-DEFAULT_MULTI(CustomVJP)
+DEFAULT_MULTI(CustomTransforms)
 DEFAULT_MULTI(Depends)
 DEFAULT(Divide)
 DEFAULT(NumberOfElements)
@@ -64,6 +68,7 @@ DEFAULT(Full)
 DEFAULT(Gather)
 DEFAULT(Greater)
 DEFAULT(GreaterEqual)
+DEFAULT(Hadamard)
 DEFAULT(Less)
 DEFAULT(LessEqual)
 DEFAULT(Load)
@@ -108,6 +113,7 @@ DEFAULT(Tan)
 DEFAULT(Tanh)
 DEFAULT(Transpose)
 DEFAULT(Inverse)
+DEFAULT(Cholesky)
 
 namespace {

@@ -0,0 +1,107 @@
// Copyright © 2024 Apple Inc.
#include <cassert>
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/hadamard.h"
#include "mlx/primitives.h"
namespace mlx::core {
// n = 2^k component
template <typename T>
void hadamard_n(array& out, int n, int m, float scale) {
for (int b = 0; b < out.size() / n; b++) {
size_t loc = b * n;
T* data_ptr = out.data<T>() + loc;
int h = 1;
int n_over_2 = n / 2;
while (h < n) {
for (int i = 0; i < n / 2; i++) {
int k = i & (h - 1);
int j = ((i - k) << 1) + k;
float x = *(data_ptr + j);
float y = *(data_ptr + j + h);
*(data_ptr + j) = x + y;
*(data_ptr + j + h) = x - y;
if (h == n_over_2) {
*(data_ptr + j) *= scale;
*(data_ptr + j + h) *= scale;
}
}
h <<= 1;
}
}
}
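The while loop above is the classic in-place fast Walsh-Hadamard butterfly: the half-distance h doubles on each pass, giving O(n log n) work instead of the O(n^2) of a dense matrix multiply, and the final pass folds in the scale. A minimal standalone sketch of the same idea, assuming a power-of-two length (hypothetical names, not the MLX source):

#include <cstdio>
#include <vector>

// In-place fast Walsh-Hadamard transform for power-of-two n (sketch).
void fwht(std::vector<float>& v) {
  int n = v.size();
  for (int h = 1; h < n; h <<= 1) {       // butterfly width doubles each pass
    for (int i = 0; i < n; i += h << 1) { // process blocks of 2h
      for (int j = i; j < i + h; ++j) {
        float x = v[j], y = v[j + h];
        v[j] = x + y;                     // sum lane
        v[j + h] = x - y;                 // difference lane
      }
    }
  }
}

int main() {
  std::vector<float> v = {1, 0, 0, 0};     // delta maps to a constant row of H
  fwht(v);
  for (float x : v)
    std::printf("%g ", x);                 // prints: 1 1 1 1
}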
// m component
template <typename T>
void hadamard_m(array& out, int n, int m, float scale) {
auto h_matrices = hadamard_matrices();
auto& matrix = h_matrices[m];
auto start = 1;
auto end = matrix.find('\n', start);
std::vector<bool> hmat_vec;
while (end != std::string_view::npos) {
auto row = matrix.substr(start, end - start);
for (int i = 0; i < row.length(); i++) {
hmat_vec.push_back(row[i] == '+');
}
start = end + 1;
end = matrix.find('\n', start);
}
for (int b = 0; b < out.size() / m / n; b++) {
size_t loc = b * n * m;
T* data_ptr = out.data<T>() + loc;
for (int i = 0; i < n; i++) {
std::vector<float> out(m);
for (int j = 0; j < m; j++) {
for (int k = 0; k < m; k++) {
float x = *(data_ptr + i + k * n);
if (hmat_vec[k + j * m]) {
out[j] += x;
} else {
out[j] -= x;
}
}
}
for (int j = 0; j < m; j++) {
*(data_ptr + i + j * n) = out[j] * scale;
}
}
}
}
template <typename T>
void hadamard(array& out, int n, int m, float scale) {
float n_scale = m > 1 ? 1.0 : scale;
hadamard_n<T>(out, n, m, n_scale);
if (m > 1) {
hadamard_m<T>(out, n, m, scale);
}
}
void Hadamard::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
// Copy input to output
copy(in, out, CopyType::General);
int axis = out.ndim() - 1;
auto [n, m] = decompose_hadamard(out.shape(axis));
switch (in.dtype()) {
case float32:
return hadamard<float>(out, n, m, scale_);
case float16:
return hadamard<float16_t>(out, n, m, scale_);
case bfloat16:
return hadamard<bfloat16_t>(out, n, m, scale_);
default:
throw std::invalid_argument("[hadamard] Unsupported type.");
}
}
} // namespace mlx::core

@@ -0,0 +1,105 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include <map>
#include "mlx/utils.h"
namespace mlx::core {
// From http://neilsloane.com/hadamard/
constexpr std::string_view h12 = R"(
+-++++++++++
--+-+-+-+-+-
+++-++----++
+---+--+-++-
+++++-++----
+-+---+--+-+
++--+++-++--
+--++---+--+
++----+++-++
+--+-++---+-
++++----+++-
+-+--+-++---
)";
constexpr std::string_view h20 = R"(
+----+----++--++-++-
-+----+---+++---+-++
--+----+---+++-+-+-+
---+----+---+++++-+-
----+----++--++-++-+
-+++++-----+--+++--+
+-+++-+---+-+--+++--
++-++--+---+-+--+++-
+++-+---+---+-+--+++
++++-----++--+-+--++
--++-+-++-+-----++++
---++-+-++-+---+-+++
+---++-+-+--+--++-++
++---++-+----+-+++-+
-++---++-+----+++++-
-+--+--++-+----+----
+-+-----++-+----+---
-+-+-+---+--+----+--
--+-+++------+----+-
+--+--++------+----+
)";
constexpr std::string_view h28 = R"(
+------++----++-+--+-+--++--
-+-----+++-----+-+--+-+--++-
--+-----+++---+-+-+----+--++
---+-----+++---+-+-+-+--+--+
----+-----+++---+-+-+++--+--
-----+-----++++--+-+--++--+-
------++----++-+--+-+--++--+
--++++-+-------++--+++-+--+-
---++++-+-----+-++--+-+-+--+
+---+++--+----++-++--+-+-+--
++---++---+----++-++--+-+-+-
+++---+----+----++-++--+-+-+
++++--------+-+--++-++--+-+-
-++++--------+++--++--+--+-+
-+-++-++--++--+--------++++-
+-+-++--+--++--+--------++++
-+-+-++--+--++--+----+---+++
+-+-+-++--+--+---+---++---++
++-+-+-++--+------+--+++---+
-++-+-+-++--+------+-++++---
+-++-+---++--+------+-++++--
-++--++-+-++-+++----++------
+-++--++-+-++-+++-----+-----
++-++---+-+-++-+++-----+----
-++-++-+-+-+-+--+++-----+---
--++-++++-+-+----+++-----+--
+--++-+-++-+-+----+++-----+-
++--++-+-++-+-+----++------+
)";
inline const std::map<int, std::string_view> hadamard_matrices() {
return {{12, h12}, {20, h20}, {28, h28}};
}
inline std::pair<int, int> decompose_hadamard(int n) {
// n = m*2^k
int m = 1;
if (!is_power_of_2(n)) {
auto h_matrices = hadamard_matrices();
for (auto [factor, _] : h_matrices) {
if (n % factor == 0) {
m = factor;
n /= factor;
break;
}
}
if (m == 1) {
throw std::invalid_argument(
"[hadamard] Only supports n = m*2^k where m in (1, 12, 20, 28).");
}
}
return {n, m};
}
} // namespace mlx::core
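decompose_hadamard factors n as m * 2^k, with m drawn from the base matrices above (or m = 1 when n is already a power of two). A small usage sketch; the include path is an assumption on my part, and the printed pairs follow from the arithmetic in the header:

#include <cstdio>
#include "mlx/backend/common/hadamard.h" // assumed include path for the header above

int main() {
  using mlx::core::decompose_hadamard;
  auto [n1, m1] = decompose_hadamard(128);  // 128 = 1 * 2^7   -> {128, 1}
  auto [n2, m2] = decompose_hadamard(1536); // 1536 = 12 * 2^7 -> {128, 12}
  auto [n3, m3] = decompose_hadamard(640);  // 640 = 20 * 2^5  -> {32, 20}
  std::printf("%d %d | %d %d | %d %d\n", n1, m1, n2, m2, n3, m3);
}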

@@ -2,7 +2,6 @@
 
 #include "mlx/allocator.h"
 #include "mlx/backend/common/copy.h"
+#include "mlx/linalg.h"
 #include "mlx/primitives.h"
 
 #ifdef ACCELERATE_NEW_LAPACK
@@ -11,24 +10,39 @@
 #include <lapack.h>
 #endif
 
+// Wrapper to account for differences in
+// LAPACK implementations (basically how to pass the 'uplo' string to fortran).
+int strtri_wrapper(char uplo, char diag, float* matrix, int N) {
+  int info;
+#ifdef LAPACK_FORTRAN_STRLEN_END
+  strtri_(
+      /* uplo = */ &uplo,
+      /* diag = */ &diag,
+      /* N = */ &N,
+      /* a = */ matrix,
+      /* lda = */ &N,
+      /* info = */ &info,
+      /* uplo_len = */ static_cast<size_t>(1),
+      /* diag_len = */ static_cast<size_t>(1));
+#else
+  strtri_(
+      /* uplo = */ &uplo,
+      /* diag = */ &diag,
+      /* N = */ &N,
+      /* a = */ matrix,
+      /* lda = */ &N,
+      /* info = */ &info);
+#endif
+  return info;
+}
+
 namespace mlx::core {
 
-void inverse_impl(const array& a, array& inv) {
-  // Lapack uses the column-major convention. We take advantage of the
-  // following identity to avoid transposing (see
-  // https://math.stackexchange.com/a/340234):
-  //   (A⁻¹)ᵀ = (Aᵀ)⁻¹
-  // The inverse is computed in place, so just copy the input to the output.
-  copy(a, inv, a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
-  const int N = a.shape(-1);
-  const size_t num_matrices = a.size() / (N * N);
+void general_inv(array& inv, int N, int i) {
   int info;
   auto ipiv = array::Data{allocator::malloc_or_wait(sizeof(int) * N)};
-  for (int i = 0; i < num_matrices; i++) {
   // Compute LU factorization.
   sgetrf_(
       /* m = */ &N,
@@ -65,8 +79,7 @@ void inverse_impl(const array& a, array& inv) {
   }
 
   const int lwork = workspace_size;
-  auto scratch =
-      array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
+  auto scratch = array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
 
   // Compute inverse.
   sgetri_(
@@ -84,21 +97,44 @@ void inverse_impl(const array& a, array& inv) {
     throw std::runtime_error(ss.str());
   }
 }
 
+void tri_inv(array& inv, int N, int i, bool upper) {
+  const char uplo = upper ? 'L' : 'U';
+  const char diag = 'N';
+  int info = strtri_wrapper(uplo, diag, inv.data<float>() + N * N * i, N);
+  if (info != 0) {
+    std::stringstream ss;
+    ss << "inverse_impl: triangular inversion failed with error code " << info;
+    throw std::runtime_error(ss.str());
+  }
+}
+
+void inverse_impl(const array& a, array& inv, bool tri, bool upper) {
+  // Lapack uses the column-major convention. We take advantage of the
+  // following identity to avoid transposing (see
+  // https://math.stackexchange.com/a/340234):
+  //   (A⁻¹)ᵀ = (Aᵀ)⁻¹
+  // The inverse is computed in place, so just copy the input to the output.
+  copy(a, inv, a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
+  const int N = a.shape(-1);
+  const size_t num_matrices = a.size() / (N * N);
+  for (int i = 0; i < num_matrices; i++) {
+    if (tri) {
+      tri_inv(inv, N, i, upper);
+    } else {
+      general_inv(inv, N, i);
+    }
+  }
+}
 
 void Inverse::eval(const std::vector<array>& inputs, array& output) {
   if (inputs[0].dtype() != float32) {
     throw std::runtime_error("[Inverse::eval] only supports float32.");
   }
-  inverse_impl(inputs[0], output);
+  inverse_impl(inputs[0], output, tri_, upper_);
+}
+
+std::pair<std::vector<array>, std::vector<int>> Inverse::vmap(
+    const std::vector<array>& inputs,
+    const std::vector<int>& axes) {
+  auto ax = axes[0] >= 0 ? 0 : -1;
+  auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
+  return {{linalg::inv(a, stream())}, {ax}};
 }
 
 } // namespace mlx::core

@@ -11,7 +11,7 @@ GCC=$2
 SRCDIR=$3
 CLANG=$4
 
-if [ $CLANG = "TRUE" ]; then
+if [ "$CLANG" = "TRUE" ]; then
   read -r -d '' INCLUDES <<- EOM
   #include <cmath>
   #include <complex>
@@ -28,6 +28,7 @@ const char* get_kernel_preamble() {
   return R"preamble(
 $INCLUDES
 $CONTENT
+using namespace mlx::core;
 using namespace mlx::core::detail;
 )preamble";
 }

@@ -0,0 +1,305 @@
// Copyright © 2024 Apple Inc.
#ifdef ACCELERATE_NEW_LAPACK
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif
#include <cstring>
#include "mlx/array.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
#include "mlx/primitives.h"
namespace mlx::core {
namespace {
template <typename T, typename mask_t>
inline void mask_matrix(
T* data,
const mask_t* mask,
int block_size,
const int X,
const int Y,
const size_t X_data_str,
const size_t Y_data_str,
const size_t X_mask_str,
const size_t Y_mask_str,
const size_t mask_offset) {
int tX = (X + block_size - 1) / block_size;
int tY = (Y + block_size - 1) / block_size;
for (int i = 0; i < tX; i++) {
for (int j = 0; j < tY; j++) {
mask_t do_mask = mask[mask_offset + i * X_mask_str + j * Y_mask_str];
if (do_mask != 1) {
int loc_x = i * block_size;
int loc_y = j * block_size;
T* data_block = data + loc_x * X_data_str + loc_y * Y_data_str;
int size_x = std::min(block_size, X - loc_x);
int size_y = std::min(block_size, Y - loc_y);
for (int ii = 0; ii < size_x; ii++) {
for (int jj = 0; jj < size_y; jj++) {
if constexpr (std::is_same_v<mask_t, bool>) {
data_block[ii * X_data_str + jj * Y_data_str] = T(0.);
} else {
data_block[ii * X_data_str + jj * Y_data_str] *= do_mask;
}
}
}
}
}
}
}
} // namespace
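mask_matrix walks the matrix in block_size x block_size tiles and, wherever the mask is not set, zeroes the tile (boolean masks) or scales it (float masks); BlockMaskedMM below applies this to the operands before the sgemm and to the output after it. A toy standalone illustration of the boolean path on a dense row-major matrix (my own example, simplified from the code above):

#include <cstdio>

int main() {
  const int N = 4, block = 2;
  float data[N * N];
  for (int i = 0; i < N * N; ++i)
    data[i] = 1.0f;

  // 2x2 tile mask: keep the diagonal tiles, zero the off-diagonal ones.
  bool mask[4] = {true, false, false, true};

  int tiles = N / block;
  for (int ti = 0; ti < tiles; ++ti)
    for (int tj = 0; tj < tiles; ++tj)
      if (!mask[ti * tiles + tj])
        for (int i = 0; i < block; ++i)
          for (int j = 0; j < block; ++j)
            data[(ti * block + i) * N + (tj * block + j)] = 0.0f;

  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j)
      std::printf("%.0f ", data[i * N + j]);
    std::printf("\n");
  }
  // 1 1 0 0
  // 1 1 0 0
  // 0 0 1 1
  // 0 0 1 1
}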
void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[BlockMaskedMM::eval] Currently only supports float32.");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto& a_pre = inputs[0];
auto& b_pre = inputs[1];
auto check_transpose =
[](const array& arr, bool do_copy, bool expand_all = false) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (!expand_all && stx == arr.shape(-1) && sty == 1) {
if (do_copy) {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::Vector);
return std::make_tuple(false, stx, arr_copy);
}
return std::make_tuple(false, stx, arr);
} else if (!expand_all && stx == 1 && sty == arr.shape(-2)) {
if (do_copy) {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::Vector);
return std::make_tuple(true, sty, arr_copy);
}
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::General);
size_t stx = arr.shape(-1);
return std::make_tuple(false, stx, arr_copy);
}
};
bool has_op_mask = inputs.size() > 3;
bool has_out_mask = inputs.size() == 3 || inputs.size() == 5;
auto [a_transposed, lda, a] =
check_transpose(a_pre, has_op_mask, inputs.back().dtype() != bool_);
auto [b_transposed, ldb, b] =
check_transpose(b_pre, has_op_mask, inputs.back().dtype() != bool_);
size_t M = a.shape(-2);
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (M == 0 || N == 0) {
return;
}
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
auto mask_array = [](const array& mask,
float* data,
int block_size,
int batch_idx,
int X,
int Y,
size_t X_data_str,
size_t Y_data_str) {
size_t mask_offset = elem_to_loc(
mask.shape(-1) * mask.shape(-2) * batch_idx,
mask.shape(),
mask.strides());
size_t X_mask_str = mask.strides()[mask.ndim() - 2];
size_t Y_mask_str = mask.strides()[mask.ndim() - 1];
if (mask.dtype() == bool_) {
return mask_matrix(
data,
mask.data<bool>(),
block_size,
X,
Y,
X_data_str,
Y_data_str,
X_mask_str,
Y_mask_str,
mask_offset);
} else {
return mask_matrix(
data,
mask.data<float>(),
block_size,
X,
Y,
X_data_str,
Y_data_str,
X_mask_str,
Y_mask_str,
mask_offset);
}
};
for (int i = 0; i < (out.size() / (M * size_t(N))); ++i) {
// Adjust pointer
float* ai =
a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides());
float* bi =
b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides());
float* ci = out.data<float>() + M * N * i;
// Zero out blocks in a and b if needed
if (has_op_mask) {
auto& a_mask = inputs[inputs.size() - 2];
mask_array(
a_mask,
ai,
block_size_,
i,
M,
K,
a_transposed ? 1 : lda,
a_transposed ? lda : 1);
auto& b_mask = inputs[inputs.size() - 1];
mask_array(
b_mask,
bi,
block_size_,
i,
K,
N,
b_transposed ? 1 : ldb,
b_transposed ? ldb : 1);
}
// Do matmul
cblas_sgemm(
CblasRowMajor,
a_transposed ? CblasTrans : CblasNoTrans, // transA
b_transposed ? CblasTrans : CblasNoTrans, // transB
M,
N,
K,
1.0, // alpha
ai,
lda,
bi,
ldb,
0.0, // beta
ci,
out.shape(-1) // ldc
);
// Zero out blocks in out
if (has_out_mask) {
mask_array(inputs[2], ci, block_size_, i, M, N, N, 1);
}
}
}
void GatherMM::eval(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[GatherMM::eval] Currently only supports float32.");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto& a_pre = inputs[0];
auto& b_pre = inputs[1];
auto check_transpose = [](const array& arr) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (stx == arr.shape(-1) && sty == 1) {
return std::make_tuple(false, stx, arr);
} else if (stx == 1 && sty == arr.shape(-2)) {
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::General);
size_t stx = arr.shape(-1);
return std::make_tuple(false, stx, arr_copy);
}
};
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
size_t M = a.shape(-2);
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (M == 0 || N == 0) {
return;
}
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
// Get batch dims
auto batch_size_out = out.size() / (M * N);
size_t matrix_stride_out = M * N;
auto get_batch_dims = [](const auto& v) {
return decltype(v){v.begin(), v.end() - 2};
};
auto& lhs_indices = inputs[2];
auto& rhs_indices = inputs[3];
std::vector<int> batch_shape = get_batch_dims(out.shape());
int batch_ndim = batch_shape.size();
std::vector<int> batch_shape_A = get_batch_dims(a.shape());
std::vector<size_t> batch_strides_A = get_batch_dims(a.strides());
std::vector<int> batch_shape_B = get_batch_dims(b.shape());
std::vector<size_t> batch_strides_B = get_batch_dims(b.strides());
const uint32_t* lhs_indices_ptr = lhs_indices.data<uint32_t>();
const uint32_t* rhs_indices_ptr = rhs_indices.data<uint32_t>();
for (int i = 0; i < batch_size_out; i++) {
// Get index
uint32_t indx_A = lhs_indices_ptr[elem_to_loc(i, lhs_indices)];
uint32_t indx_B = rhs_indices_ptr[elem_to_loc(i, rhs_indices)];
cblas_sgemm(
CblasRowMajor,
a_transposed ? CblasTrans : CblasNoTrans, // transA
b_transposed ? CblasTrans : CblasNoTrans, // transB
M,
N,
K,
1.0f, // alpha
a.data<float>() + elem_to_loc(indx_A, batch_shape_A, batch_strides_A),
lda,
b.data<float>() + elem_to_loc(indx_B, batch_shape_B, batch_strides_B),
ldb,
0.0f, // beta
out.data<float>() + matrix_stride_out * i,
out.shape(-1) // ldc
);
}
}
} // namespace mlx::core
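GatherMM above runs one sgemm per output batch element, with the operands chosen by index arrays instead of by position, i.e. out[i] = A[lhs_indices[i]] @ B[rhs_indices[i]]. A minimal reference loop for the contiguous row-major case (a sketch of the semantics under that assumption, not the strided MLX implementation):

#include <cstdint>
#include <vector>

// Sketch: out[i] = A[lhs[i]] (MxK) times B[rhs[i]] (KxN), all row major.
void gather_mm_ref(
    const std::vector<float>& A, // batchA * M * K elements
    const std::vector<float>& B, // batchB * K * N elements
    const std::vector<uint32_t>& lhs,
    const std::vector<uint32_t>& rhs,
    std::vector<float>& out, // lhs.size() * M * N elements
    int M, int K, int N) {
  for (size_t i = 0; i < lhs.size(); ++i) {
    const float* a = A.data() + size_t(lhs[i]) * M * K;
    const float* b = B.data() + size_t(rhs[i]) * K * N;
    float* c = out.data() + i * M * N;
    for (int m = 0; m < M; ++m)
      for (int n = 0; n < N; ++n) {
        float acc = 0.0f;
        for (int k = 0; k < K; ++k)
          acc += a[m * K + k] * b[k * N + n];
        c[m * N + n] = acc;
      }
  }
}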

@@ -108,133 +108,146 @@ struct Abs {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::abs(x); return std::abs(x);
}; }
uint8_t operator()(uint8_t x) { uint8_t operator()(uint8_t x) {
return x; return x;
}; }
uint16_t operator()(uint16_t x) { uint16_t operator()(uint16_t x) {
return x; return x;
}; }
uint32_t operator()(uint32_t x) { uint32_t operator()(uint32_t x) {
return x; return x;
}; }
uint64_t operator()(uint64_t x) { uint64_t operator()(uint64_t x) {
return x; return x;
}; }
bool operator()(bool x) { bool operator()(bool x) {
return x; return x;
}; }
}; };
struct ArcCos { struct ArcCos {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::acos(x); return std::acos(x);
}; }
}; };
struct ArcCosh { struct ArcCosh {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::acosh(x); return std::acosh(x);
}; }
}; };
struct ArcSin { struct ArcSin {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::asin(x); return std::asin(x);
}; }
}; };
struct ArcSinh { struct ArcSinh {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::asinh(x); return std::asinh(x);
}; }
}; };
struct ArcTan { struct ArcTan {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::atan(x); return std::atan(x);
}
}; };
struct ArcTan2 {
template <typename T>
T operator()(T y, T x) {
return std::atan2(y, x);
}
}; };
struct ArcTanh { struct ArcTanh {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::atanh(x); return std::atanh(x);
}; }
}; };
struct Ceil { struct Ceil {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::ceil(x); return std::ceil(x);
}; }
int8_t operator()(int8_t x) { int8_t operator()(int8_t x) {
return x; return x;
}; }
int16_t operator()(int16_t x) { int16_t operator()(int16_t x) {
return x; return x;
}; }
int32_t operator()(int32_t x) { int32_t operator()(int32_t x) {
return x; return x;
}; }
int64_t operator()(int64_t x) { int64_t operator()(int64_t x) {
return x; return x;
}; }
uint8_t operator()(uint8_t x) { uint8_t operator()(uint8_t x) {
return x; return x;
}; }
uint16_t operator()(uint16_t x) { uint16_t operator()(uint16_t x) {
return x; return x;
}; }
uint32_t operator()(uint32_t x) { uint32_t operator()(uint32_t x) {
return x; return x;
}; }
uint64_t operator()(uint64_t x) { uint64_t operator()(uint64_t x) {
return x; return x;
}; }
bool operator()(bool x) { bool operator()(bool x) {
return x; return x;
}
}; };
struct Conjugate {
complex64_t operator()(complex64_t x) {
return std::conj(x);
}
}; };
struct Cos { struct Cos {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::cos(x); return std::cos(x);
}; }
}; };
struct Cosh { struct Cosh {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::cosh(x); return std::cosh(x);
}; }
}; };
struct Erf { struct Erf {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return static_cast<T>(fast_erf(static_cast<float>(x))); return static_cast<T>(fast_erf(static_cast<float>(x)));
}; }
}; };
struct ErfInv { struct ErfInv {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return static_cast<T>(fast_erfinv(static_cast<float>(x))); return static_cast<T>(fast_erfinv(static_cast<float>(x)));
}; }
}; };
struct Exp { struct Exp {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return fast_exp(x); return fast_exp(x);
}; }
complex64_t operator()(complex64_t x) { complex64_t operator()(complex64_t x) {
return std::exp(x); return std::exp(x);
@@ -245,83 +258,83 @@ struct Expm1 {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return expm1(x); return expm1(x);
}; }
}; };
struct Floor { struct Floor {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::floor(x); return std::floor(x);
}; }
int8_t operator()(int8_t x) { int8_t operator()(int8_t x) {
return x; return x;
}; }
int16_t operator()(int16_t x) { int16_t operator()(int16_t x) {
return x; return x;
}; }
int32_t operator()(int32_t x) { int32_t operator()(int32_t x) {
return x; return x;
}; }
int64_t operator()(int64_t x) { int64_t operator()(int64_t x) {
return x; return x;
}; }
uint8_t operator()(uint8_t x) { uint8_t operator()(uint8_t x) {
return x; return x;
}; }
uint16_t operator()(uint16_t x) { uint16_t operator()(uint16_t x) {
return x; return x;
}; }
uint32_t operator()(uint32_t x) { uint32_t operator()(uint32_t x) {
return x; return x;
}; }
uint64_t operator()(uint64_t x) { uint64_t operator()(uint64_t x) {
return x; return x;
}; }
bool operator()(bool x) { bool operator()(bool x) {
return x; return x;
}; }
}; };
struct Log { struct Log {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::log(x); return std::log(x);
}; }
}; };
struct Log2 { struct Log2 {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::log2(x); return std::log2(x);
}; }
}; };
struct Log10 { struct Log10 {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::log10(x); return std::log10(x);
}; }
}; };
struct Log1p { struct Log1p {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return log1p(x); return log1p(x);
}; }
}; };
struct LogicalNot { struct LogicalNot {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return !x; return !x;
}; }
}; };
struct Negative { struct Negative {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return -x; return -x;
}; }
}; };
struct Round { struct Round {
@@ -366,49 +379,49 @@ struct Sin {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::sin(x); return std::sin(x);
}; }
}; };
struct Sinh { struct Sinh {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::sinh(x); return std::sinh(x);
}; }
}; };
struct Square { struct Square {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return x * x; return x * x;
}; }
}; };
struct Sqrt { struct Sqrt {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::sqrt(x); return std::sqrt(x);
}; }
}; };
struct Rsqrt { struct Rsqrt {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return static_cast<decltype(x)>(1.0) / std::sqrt(x); return static_cast<decltype(x)>(1.0) / std::sqrt(x);
}; }
}; };
struct Tan { struct Tan {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::tan(x); return std::tan(x);
}; }
}; };
struct Tanh { struct Tanh {
template <typename T> template <typename T>
T operator()(T x) { T operator()(T x) {
return std::tanh(x); return std::tanh(x);
}; }
}; };
struct Add { struct Add {
@@ -541,7 +554,7 @@ struct LogAddExp {
? maxval ? maxval
: static_cast<decltype(x)>( : static_cast<decltype(x)>(
maxval + std::log1p(fast_exp(minval - maxval))); maxval + std::log1p(fast_exp(minval - maxval)));
}; }
}; };
struct Multiply { struct Multiply {
@@ -589,14 +602,14 @@ struct LogicalAnd {
template <typename T> template <typename T>
T operator()(T x, T y) { T operator()(T x, T y) {
return x && y; return x && y;
}; }
}; };
struct LogicalOr { struct LogicalOr {
template <typename T> template <typename T>
T operator()(T x, T y) { T operator()(T x, T y) {
return x || y; return x || y;
}; }
}; };
struct Select { struct Select {
@@ -606,4 +619,39 @@ struct Select {
} }
}; };
struct BitwiseAnd {
template <typename T>
T operator()(T x, T y) {
return x & y;
}
};
struct BitwiseOr {
template <typename T>
T operator()(T x, T y) {
return x | y;
}
};
struct BitwiseXor {
template <typename T>
T operator()(T x, T y) {
return x ^ y;
}
};
struct LeftShift {
template <typename T>
T operator()(T x, T y) {
return x << y;
}
};
struct RightShift {
template <typename T>
T operator()(T x, T y) {
return x >> y;
}
};
} // namespace mlx::core::detail } // namespace mlx::core::detail

@@ -1,4 +1,4 @@
-// Copyright © 2023 Apple Inc.
+// Copyright © 2023-2024 Apple Inc.
 
 #include <algorithm>
 #include <cassert>
@@ -8,9 +8,9 @@
 #include "mlx/allocator.h"
 #include "mlx/backend/common/arange.h"
-#include "mlx/backend/common/binary.h"
 #include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/ops.h"
+#include "mlx/backend/common/slicing.h"
 #include "mlx/backend/common/threefry.h"
 #include "mlx/backend/common/unary.h"
 #include "mlx/backend/common/utils.h"
@@ -113,61 +113,6 @@ void AsType::eval(const std::vector<array>& inputs, array& out) {
   copy(in, out, ctype);
 }
void AsStrided::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (!in.flags().row_contiguous) {
// Just ensuring that inputs[0] came from the ops which would ensure the
// input is row contiguous.
throw std::runtime_error(
"AsStrided must be used with row contiguous arrays only.");
}
// Compute the flags given the shape and strides
bool row_contiguous = true, col_contiguous = true;
size_t r = 1, c = 1;
for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
r *= shape_[i];
c *= shape_[j];
}
auto flags = in.flags();
// TODO: Compute the contiguous flag in a better way cause now we are
// unnecessarily strict.
flags.contiguous = row_contiguous || col_contiguous;
flags.row_contiguous = row_contiguous;
flags.col_contiguous = col_contiguous;
// There is no easy way to compute the actual data size so we use out.size().
// The contiguous flag will almost certainly not be set so no code should
// rely on data_size anyway.
size_t data_size = out.size();
return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
}
void Broadcast::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.size() == 0) {
out.set_data(nullptr);
return;
}
std::vector<size_t> strides(out.ndim(), 0);
int diff = out.ndim() - in.ndim();
for (int i = in.ndim() - 1; i >= 0; --i) {
strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
}
auto flags = in.flags();
if (out.size() > in.size()) {
flags.row_contiguous = flags.col_contiguous = false;
}
out.copy_shared_buffer(in, strides, flags, in.data_size());
}
 void Ceil::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
   auto& in = inputs[0];
@@ -203,9 +148,15 @@ void Concatenate::eval(const std::vector<array>& inputs, array& out) {
   }
 }
 
-void Copy::eval(const std::vector<array>& inputs, array& out) {
+void Conjugate::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
-  out.copy_shared_buffer(inputs[0]);
+  const auto& in = inputs[0];
+  if (out.dtype() == complex64) {
+    unary_fp(in, out, detail::Conjugate());
+  } else {
+    throw std::invalid_argument(
+        "[conjugate] conjugate must be called on complex input.");
+  }
 }
 void Cos::eval(const std::vector<array>& inputs, array& out) {
@@ -232,81 +183,6 @@ void Cosh::eval(const std::vector<array>& inputs, array& out) {
   }
 }
void CustomVJP::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
i++, j++) {
outputs[i].copy_shared_buffer(inputs[j]);
}
}
void Depends::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0; i < outputs.size(); i++) {
outputs[i].copy_shared_buffer(inputs[i]);
}
}
void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
double numel = 1;
for (auto ax : axes_) {
numel *= inputs[0].shape(ax);
}
if (inverted_) {
numel = 1.0 / numel;
}
switch (out.dtype()) {
case bool_:
*out.data<bool>() = static_cast<bool>(numel);
break;
case uint8:
*out.data<uint8_t>() = static_cast<uint8_t>(numel);
break;
case uint16:
*out.data<uint16_t>() = static_cast<uint16_t>(numel);
break;
case uint32:
*out.data<uint32_t>() = static_cast<uint32_t>(numel);
break;
case uint64:
*out.data<uint64_t>() = static_cast<uint64_t>(numel);
break;
case int8:
*out.data<int8_t>() = static_cast<int8_t>(numel);
break;
case int16:
*out.data<int16_t>() = static_cast<int16_t>(numel);
break;
case int32:
*out.data<int32_t>() = static_cast<int32_t>(numel);
break;
case int64:
*out.data<int64_t>() = static_cast<int64_t>(numel);
break;
case float16:
*out.data<float16_t>() = static_cast<float16_t>(numel);
break;
case float32:
*out.data<float>() = static_cast<float>(numel);
break;
case bfloat16:
*out.data<bfloat16_t>() = static_cast<bfloat16_t>(numel);
break;
case complex64:
*out.data<complex64_t>() = static_cast<complex64_t>(numel);
break;
}
}
 void Erf::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
   const auto& in = inputs[0];
@@ -437,20 +313,6 @@ void LogicalNot::eval(const std::vector<array>& inputs, array& out) {
   unary(in, out, detail::LogicalNot());
 }
void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalAnd requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalAnd());
}
void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalOr requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalOr());
}
 void Negative::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
   auto& in = inputs[0];
@@ -536,63 +398,6 @@ void RandomBits::eval(const std::vector<array>& inputs, array& out) {
   }
 }
std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
const array& in,
const array& out) {
// Special case for empty arrays or row contiguous arrays
if (in.size() == 0 || in.flags().row_contiguous) {
return {false, out.strides()};
}
// Special case for scalars
if (in.ndim() == 0) {
std::vector<size_t> out_strides(out.ndim(), 0);
return {false, out_strides};
}
// Firstly let's collapse all the contiguous dimensions of the input
auto [shape, _strides] = collapse_contiguous_dims(in);
auto& strides = _strides[0];
// If shapes fit exactly in the contiguous dims then no copy is necessary so
// let's check.
std::vector<size_t> out_strides;
bool copy_necessary = false;
int j = 0;
for (int i = 0; i < out.ndim(); i++) {
int N = out.shape(i);
if (j < shape.size() && shape[j] % N == 0) {
shape[j] /= N;
out_strides.push_back(shape[j] * strides[j]);
j += (shape[j] == 1);
} else if (N == 1) {
// i > 0 because otherwise j < shape.size() && shape[j] % 1 == 0
out_strides.push_back(out_strides.back());
} else {
copy_necessary = true;
break;
}
}
return {copy_necessary, out_strides};
}
void Reshape::shared_buffer_reshape(
const array& in,
const std::vector<size_t>& out_strides,
array& out) {
auto flags = in.flags();
if (flags.row_contiguous) {
// For row contiguous reshapes:
// - Shallow copy the buffer
// - If reshaping into a vector (all singleton dimensions except one) it
// becomes col contiguous again.
auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
}
out.copy_shared_buffer(in, out_strides, flags, in.data_size());
}
 void Reshape::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
   const auto& in = inputs[0];
@@ -600,7 +405,17 @@ void Reshape::eval(const std::vector<array>& inputs, array& out) {
   auto [copy_necessary, out_strides] = prepare_reshape(in, out);
 
   if (copy_necessary) {
-    copy(in, out, in.data_size() == 1 ? CopyType::Scalar : CopyType::General);
+    out.set_data(allocator::malloc_or_wait(out.nbytes()));
+    auto out_strides = make_contiguous_strides<size_t>(in.shape());
+    copy_inplace<size_t>(
+        in,
+        out,
+        in.shape(),
+        in.strides(),
+        out_strides,
+        0,
+        0,
+        CopyType::General);
   } else {
     shared_buffer_reshape(in, out_strides, out);
   }
@@ -663,49 +478,6 @@ void Sinh::eval(const std::vector<array>& inputs, array& out) {
   }
 }
std::tuple<bool, int64_t, std::vector<int64_t>> Slice::prepare_slice(
const array& in) {
int64_t data_offset = 0;
bool copy_needed = false;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices_[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides_[i];
copy_needed |= strides_[i] < 0;
}
return std::make_tuple(copy_needed, data_offset, inp_strides);
}
void Slice::shared_buffer_slice(
const array& in,
const std::vector<size_t>& out_strides,
size_t data_offset,
array& out) {
// Compute row/col contiguity
auto [data_size, is_row_contiguous, is_col_contiguous] =
check_contiguity(out.shape(), out_strides);
auto flags = in.flags();
flags.row_contiguous = is_row_contiguous;
flags.col_contiguous = is_col_contiguous;
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in.data_size()) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
out.copy_shared_buffer(in, out_strides, flags, data_size, data_offset);
}
 void Slice::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
   if (out.size() == 0) {
@@ -716,7 +488,8 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
   auto& in = inputs[0];
 
   // Calculate out strides, initial offset and if copy needs to be made
-  auto [copy_needed, data_offset, inp_strides] = prepare_slice(in);
+  auto [copy_needed, data_offset, inp_strides] =
+      prepare_slice(in, start_indices_, strides_);
 
   // Do copy if needed
   if (copy_needed) {
@@ -737,18 +510,6 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
   }
 }
std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
const array& in) {
int64_t data_offset = 0;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices_[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides_[i];
}
return std::make_tuple(data_offset, inp_strides);
}
 void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 2);
   if (out.size() == 0) {
@@ -786,58 +547,6 @@ void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
       /* CopyType ctype = */ CopyType::GeneralGeneral);
 }
void Split::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 1);
auto& in = inputs[0];
auto compute_new_flags = [](const auto& shape,
const auto& strides,
size_t in_data_size,
auto flags) {
size_t data_size = 1;
size_t f_stride = 1;
size_t b_stride = 1;
flags.row_contiguous = true;
flags.col_contiguous = true;
for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
flags.col_contiguous &= strides[i] == f_stride || shape[i] == 1;
flags.row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
f_stride *= shape[i];
b_stride *= shape[ri];
if (strides[i] > 0) {
data_size *= shape[i];
}
}
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in_data_size) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
return std::pair<decltype(flags), size_t>{flags, data_size};
};
std::vector<int> indices(1, 0);
indices.insert(indices.end(), indices_.begin(), indices_.end());
for (int i = 0; i < indices.size(); i++) {
size_t offset = indices[i] * in.strides()[axis_];
auto [new_flags, data_size] = compute_new_flags(
outputs[i].shape(), in.strides(), in.data_size(), in.flags());
outputs[i].copy_shared_buffer(
in, in.strides(), new_flags, data_size, offset);
}
}
 void Square::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
   auto& in = inputs[0];
@@ -854,11 +563,6 @@ void Sqrt::eval(const std::vector<array>& inputs, array& out) {
   }
 }
void StopGradient::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.copy_shared_buffer(inputs[0]);
}
 void Tan::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
   const auto& in = inputs[0];
@@ -883,38 +587,36 @@ void Tanh::eval(const std::vector<array>& inputs, array& out) {
   }
 }
 
-void Transpose::eval(const std::vector<array>& inputs, array& out) {
+void View::eval_cpu(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
-  std::vector<size_t> out_strides(out.ndim());
   auto& in = inputs[0];
-  for (int ax = 0; ax < axes_.size(); ++ax) {
-    out_strides[ax] = in.strides()[axes_[ax]];
-  }
-
-  // Conditions for {row/col}_contiguous
-  // - array must be contiguous (no gaps)
-  // - underlying buffer size should have the same size as the array
-  // - cumulative product of shapes is equal to the strides (we can ignore axes
-  //   with size == 1)
-  //   - in the forward direction (column contiguous)
-  //   - in the reverse direction (row contiguous)
-  // - vectors are both row and col contiguous (hence if both row/col are
-  //   true, they stay true)
-  auto flags = in.flags();
-  if (flags.contiguous && in.data_size() == in.size()) {
-    size_t f_stride = 1;
-    size_t b_stride = 1;
-    flags.col_contiguous = true;
-    flags.row_contiguous = true;
-    for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
-      flags.col_contiguous &= (out_strides[i] == f_stride || out.shape(i) == 1);
-      f_stride *= out.shape(i);
-      flags.row_contiguous &=
-          (out_strides[ri] == b_stride || out.shape(ri) == 1);
-      b_stride *= out.shape(ri);
-    }
-  }
-  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
+  auto ibytes = size_of(in.dtype());
+  auto obytes = size_of(out.dtype());
+  // Conditions for buffer copying (disjunction):
+  // - type size is the same
+  // - type size is smaller and the last axis is contiguous
+  // - the entire array is row contiguous
+  if (ibytes == obytes || obytes < ibytes && in.strides().back() == 1 ||
+      in.flags().row_contiguous) {
+    auto strides = in.strides();
+    for (int i = 0; i < strides.size() - 1; ++i) {
+      strides[i] *= ibytes;
+      strides[i] /= obytes;
+    }
+    out.copy_shared_buffer(
+        in, strides, in.flags(), in.data_size() * obytes / ibytes);
+  } else {
+    auto tmp = array(in.shape(), in.dtype(), nullptr, {});
+    tmp.set_data(allocator::malloc_or_wait(tmp.nbytes()));
+    copy_inplace(in, tmp, CopyType::General);
+
+    auto flags = out.flags();
+    flags.contiguous = true;
+    flags.row_contiguous = true;
+    auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
+    flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
+    out.move_shared_buffer(tmp, out.strides(), flags, out.size());
+  }
 }
 
 } // namespace mlx::core
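The zero-copy branch of View::eval_cpu rescales every stride except the innermost by ibytes/obytes, because strides are counted in elements of the viewed dtype rather than in bytes. A worked example of that arithmetic, with made-up numbers:

#include <cstdio>

int main() {
  // Viewing a row-contiguous float32 array of shape (2, 4) as int16:
  // ibytes = 4, obytes = 2, so every stride but the last doubles.
  int ibytes = 4, obytes = 2;
  long strides[2] = {4, 1};       // float32 strides, in elements
  for (int i = 0; i < 1; ++i) {   // all axes except the innermost
    strides[i] = strides[i] * ibytes / obytes;
  }
  // The viewed shape is (2, 8) with strides (8, 1) in int16 elements.
  std::printf("%ld %ld\n", strides[0], strides[1]); // prints: 8 1
}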

@@ -192,7 +192,7 @@ void _qmm_dispatch_typed(
 }
 
 void _qmm_dispatch(
-    array out,
+    array& out,
     const array& x,
     const array& w,
     const array& scales,
@@ -253,6 +253,81 @@ void _qmm_dispatch(
   }
 }
void _bs_qmm_dispatch(
array& out,
const array& x,
const array& w,
const array& scales,
const array& biases,
const array& lhs_indices,
const array& rhs_indices,
int bits,
int group_size,
bool transposed_w) {
int K = x.shape(-1);
int M = x.shape(-2);
int N = out.shape(-1);
int w_els = w.shape(-1) * w.shape(-2);
int g_els = scales.shape(-1) * scales.shape(-2);
const uint32_t* lhs_indices_data = lhs_indices.data<uint32_t>();
const uint32_t* rhs_indices_data = rhs_indices.data<uint32_t>();
for (int i = 0; i < lhs_indices.size(); i++) {
int x_idx = lhs_indices_data[elem_to_loc(i, lhs_indices)];
int w_idx = rhs_indices_data[elem_to_loc(i, rhs_indices)];
switch (x.dtype()) {
case float32:
_qmm_dispatch_typed<float>(
out.data<float>() + i * M * N,
x.data<float>() + elem_to_loc(x_idx * M * K, x),
w.data<uint32_t>() + elem_to_loc(w_idx * w_els, w),
scales.data<float>() + elem_to_loc(w_idx * g_els, scales),
biases.data<float>() + elem_to_loc(w_idx * g_els, biases),
M,
N,
K,
bits,
group_size,
transposed_w);
break;
case float16:
_qmm_dispatch_typed<float16_t>(
out.data<float16_t>() + i * M * N,
x.data<float16_t>() + elem_to_loc(x_idx * M * K, x),
w.data<uint32_t>() + elem_to_loc(w_idx * w_els, w),
scales.data<float16_t>() + elem_to_loc(w_idx * g_els, scales),
biases.data<float16_t>() + elem_to_loc(w_idx * g_els, biases),
M,
N,
K,
bits,
group_size,
transposed_w);
break;
case bfloat16:
_qmm_dispatch_typed<bfloat16_t>(
out.data<bfloat16_t>() + i * M * N,
x.data<bfloat16_t>() + elem_to_loc(x_idx * M * K, x),
w.data<uint32_t>() + elem_to_loc(w_idx * w_els, w),
scales.data<bfloat16_t>() + elem_to_loc(w_idx * g_els, scales),
biases.data<bfloat16_t>() + elem_to_loc(w_idx * g_els, biases),
M,
N,
K,
bits,
group_size,
transposed_w);
break;
default:
throw std::invalid_argument(
"[quantized_matmul] only floating types are supported");
}
}
}
 } // namespace
 
 void QuantizedMatmul::eval(const std::vector<array>& inputs, array& out) {
@@ -282,4 +357,45 @@ void QuantizedMatmul::eval(const std::vector<array>& inputs, array& out) {
   _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
 }
void GatherQMM::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 6);
auto& x_pre = inputs[0];
auto& w_pre = inputs[1];
auto& scales_pre = inputs[2];
auto& biases_pre = inputs[3];
auto& lhs_indices = inputs[4];
auto& rhs_indices = inputs[5];
auto ensure_row_contiguous_last_dims = [](const array& arr) {
auto stride_0 = arr.strides()[arr.ndim() - 2];
auto stride_1 = arr.strides()[arr.ndim() - 1];
if (stride_0 == arr.shape(-1) && stride_1 == 1) {
return arr;
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::General);
return arr_copy;
}
};
auto x = ensure_row_contiguous_last_dims(x_pre);
auto w = ensure_row_contiguous_last_dims(w_pre);
auto scales = ensure_row_contiguous_last_dims(scales_pre);
auto biases = ensure_row_contiguous_last_dims(biases_pre);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
_bs_qmm_dispatch(
out,
x,
w,
scales,
biases,
lhs_indices,
rhs_indices,
group_size_,
bits_,
transpose_);
}
} // namespace mlx::core

@@ -104,48 +104,14 @@ void reduce_dispatch_out(
} }
     case Reduce::Sum: {
       auto op = [](auto y, auto x) { (*y) = (*y) + x; };
-      switch (out.dtype()) {
-        case bool_:
-          reduction_op<InT, bool>(in, out, axes, false, op);
-          break;
-        case uint8:
-          reduction_op<InT, uint8_t>(in, out, axes, 0, op);
-          break;
-        case uint16:
-          reduction_op<InT, uint16_t>(in, out, axes, 0, op);
-          break;
-        case uint32:
-          reduction_op<InT, uint32_t>(in, out, axes, 0, op);
-          break;
-        case uint64:
-          reduction_op<InT, uint64_t>(in, out, axes, 0, op);
-          break;
-        case int8:
-          reduction_op<InT, int8_t>(in, out, axes, 0, op);
-          break;
-        case int16:
-          reduction_op<InT, int16_t>(in, out, axes, 0, op);
-          break;
-        case int32:
-          reduction_op<InT, int32_t>(in, out, axes, 0, op);
-          break;
-        case int64:
-          reduction_op<InT, int64_t>(in, out, axes, 0, op);
-          break;
-        case float16:
-          reduction_op<InT, float16_t>(in, out, axes, 0.0f, op);
-          break;
-        case float32:
-          reduction_op<InT, float>(in, out, axes, 0.0f, op);
-          break;
-        case bfloat16:
-          reduction_op<InT, bfloat16_t>(in, out, axes, 0.0f, op);
-          break;
-        case complex64:
-          reduction_op<InT, complex64_t>(in, out, axes, complex64_t{0.0f}, op);
-          break;
-      }
+      if (out.dtype() == int32) {
+        // special case since the input type can be bool
+        reduction_op<InT, int32_t>(in, out, axes, 0, op);
+      } else {
+        reduction_op<InT, InT>(in, out, axes, 0, op);
+      }
       break;
     }
     case Reduce::Prod: {
       auto op = [](auto y, auto x) { (*y) *= x; };
       reduction_op<InT, InT>(in, out, axes, 1, op);
@@ -168,6 +134,29 @@ void reduce_dispatch_out(
 } // namespace
void nd_loop(
std::function<void(int)> callback,
const std::vector<int>& shape,
const std::vector<size_t>& strides) {
std::function<void(int, int)> loop_inner;
loop_inner = [&](int dim, int offset) {
if (dim < shape.size() - 1) {
int size = shape[dim];
size_t stride = strides[dim];
for (int i = 0; i < size; i++) {
loop_inner(dim + 1, offset + i * stride);
}
} else {
int size = shape[dim];
size_t stride = strides[dim];
for (int i = 0; i < size; i++) {
callback(offset + i * stride);
}
}
};
loop_inner(0, 0);
}
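nd_loop above recurses over the leading axes and invokes the callback only on the innermost one, so it visits the strided offset of every element in row-major order of the shape. A small standalone usage sketch of the same pattern:

#include <cstdio>
#include <functional>
#include <vector>

int main() {
  // Visit a 2x3 array stored with a transposed layout: strides {1, 2}.
  std::vector<int> shape = {2, 3};
  std::vector<size_t> strides = {1, 2};

  std::function<void(int, int)> loop = [&](int dim, int offset) {
    for (int i = 0; i < shape[dim]; ++i) {
      int off = offset + i * (int)strides[dim];
      if (dim + 1 < (int)shape.size())
        loop(dim + 1, off);       // recurse into the next axis
      else
        std::printf("%d ", off);  // innermost axis: prints 0 2 4 1 3 5
    }
  };
  loop(0, 0);
}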
 void Reduce::eval(const std::vector<array>& inputs, array& out) {
   assert(inputs.size() == 1);
   auto& in = inputs[0];

@@ -49,47 +49,18 @@ struct ReductionPlan {
   ReductionPlan(ReductionOpType type_) : type(type_) {}
 };
 
-namespace {
+ReductionPlan get_reduction_plan(const array& x, const std::vector<int> axes);
 
 // Helper for the ndimensional strided loop
 // Should this be in utils?
-inline void nd_loop(
+void nd_loop(
     std::function<void(int)> callback,
     const std::vector<int>& shape,
-    const std::vector<size_t>& strides) {
-  std::function<void(int, int)> loop_inner;
-  loop_inner = [&](int dim, int offset) {
-    if (dim < shape.size() - 1) {
-      int size = shape[dim];
-      size_t stride = strides[dim];
-      for (int i = 0; i < size; i++) {
-        loop_inner(dim + 1, offset + i * stride);
-      }
-    } else {
-      int size = shape[dim];
-      size_t stride = strides[dim];
-      for (int i = 0; i < size; i++) {
-        callback(offset + i * stride);
-      }
-    }
-  };
-  loop_inner(0, 0);
-}
+    const std::vector<size_t>& strides);
 
 std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
     const array& x,
-    const std::vector<int>& axes) {
-  std::vector<int> shape = x.shape();
-  std::vector<size_t> strides = x.strides();
-  for (int i = axes.size() - 1; i >= 0; i--) {
-    int a = axes[i];
-    shape.erase(shape.begin() + a);
-    strides.erase(strides.begin() + a);
-  }
-  return std::make_pair(shape, strides);
-}
+    const std::vector<int>& axes);
 
 template <typename T, typename U, typename Op>
 struct DefaultStridedReduce {
@@ -123,102 +94,6 @@ struct DefaultContiguousReduce {
   }
 };
ReductionPlan get_reduction_plan(const array& x, const std::vector<int> axes) {
// The data is all there and we are reducing over everything
if (x.size() == x.data_size() && axes.size() == x.ndim() &&
x.flags().contiguous) {
return ContiguousAllReduce;
}
// Row contiguous input so the output is row contiguous
if (x.flags().row_contiguous) {
// Merge consecutive axes
std::vector<int> shape = {x.shape(axes[0])};
std::vector<size_t> strides = {x.strides()[axes[0]]};
for (int i = 1; i < axes.size(); i++) {
if (axes[i] - 1 == axes[i - 1]) {
shape.back() *= x.shape(axes[i]);
strides.back() = x.strides()[axes[i]];
} else {
shape.push_back(x.shape(axes[i]));
strides.push_back(x.strides()[axes[i]]);
}
}
if (strides.back() == 1) {
return ReductionPlan(ContiguousReduce, shape, strides);
} else if (strides.back() > 1) {
return ReductionPlan(ContiguousStridedReduce, shape, strides);
}
}
// Let's check if we can optimize our access patterns
//
// 1. We have a reduction axis with stride 1. Simply call
// GeneralContiguousReduce and be done with it.
// 2. We have transpositions and we are not reducing over the axis with
// stride 1. However, we are reducing over an axis where everything is
// contiguous in memory to the right of that axis. We can call strided
// reduce and be done with it.
// 2. We have weird transpositions and expands. Copy the strides to the
// output, then call strided reduce.
// Sort reduction axes by stride in order to merge them and figure out if we
// have a contiguous reduction.
std::vector<std::pair<int, size_t>> reductions;
for (auto a : axes) {
reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
}
std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
return a.second > b.second;
});
// Extract the two smallest and try to merge them in case the contiguous
// reduction can be bigger than just the last axis.
for (int i = reductions.size() - 1; i >= 1; i--) {
auto a = reductions[i];
auto b = reductions[i - 1];
// b.stride = a.shape * a.stride then a and b are contiguous
if (b.second == a.first * a.second) {
reductions.erase(reductions.begin() + i);
reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
}
}
std::vector<int> shape;
std::vector<size_t> strides;
for (auto r : reductions) {
shape.push_back(r.first);
strides.push_back(r.second);
}
// We can call the contiguous reduction op for every weird way the input is
// structured in the rest of the axes.
if (strides.back() == 1) {
return ReductionPlan(GeneralContiguousReduce, shape, strides);
}
// Delegate to the general strided reduction op if the axes after
// strides.back() are contiguous.
if (strides.back() > 1) {
int size = 1;
for (int i = x.ndim() - 1; i >= 0; i--) {
if (axes.back() == i) {
continue;
}
if (x.strides()[i] != size) {
break;
}
size *= x.shape(i);
}
if (size >= strides.back()) {
return ReductionPlan(GeneralStridedReduce, shape, strides);
}
}
return ReductionPlan(GeneralReduce, shape, strides);
}
 template <typename T, typename U, typename OpS, typename OpC, typename Op>
 void reduction_op(
     const array& x,
@@ -361,6 +236,4 @@ void reduction_op(
   reduction_op<T, U>(x, out, axes, init, ops, opc, op);
 }
 
-} // namespace
-
 } // namespace mlx::core

@@ -0,0 +1,118 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/common/reduce.h"
namespace mlx::core {
std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
const array& x,
const std::vector<int>& axes) {
std::vector<int> shape = x.shape();
std::vector<size_t> strides = x.strides();
for (int i = axes.size() - 1; i >= 0; i--) {
int a = axes[i];
shape.erase(shape.begin() + a);
strides.erase(strides.begin() + a);
}
return std::make_pair(shape, strides);
}
ReductionPlan get_reduction_plan(const array& x, const std::vector<int> axes) {
// The data is all there and we are reducing over everything
if (x.size() == x.data_size() && axes.size() == x.ndim() &&
x.flags().contiguous) {
return ContiguousAllReduce;
}
// Row contiguous input so the output is row contiguous
if (x.flags().row_contiguous) {
// Merge consecutive axes
std::vector<int> shape = {x.shape(axes[0])};
std::vector<size_t> strides = {x.strides()[axes[0]]};
for (int i = 1; i < axes.size(); i++) {
if (axes[i] - 1 == axes[i - 1]) {
shape.back() *= x.shape(axes[i]);
strides.back() = x.strides()[axes[i]];
} else {
shape.push_back(x.shape(axes[i]));
strides.push_back(x.strides()[axes[i]]);
}
}
if (strides.back() == 1) {
return ReductionPlan(ContiguousReduce, shape, strides);
} else if (strides.back() > 1) {
return ReductionPlan(ContiguousStridedReduce, shape, strides);
}
}
// Let's check if we can optimize our access patterns
//
// 1. We have a reduction axis with stride 1. Simply call
// GeneralContiguousReduce and be done with it.
// 2. We have transpositions and we are not reducing over the axis with
// stride 1. However, we are reducing over an axis where everything is
// contiguous in memory to the right of that axis. We can call strided
// reduce and be done with it.
// 3. We have weird transpositions and expands. Copy the strides to the
// output, then call strided reduce.
// Sort reduction axes by stride in order to merge them and figure out if we
// have a contiguous reduction.
std::vector<std::pair<int, size_t>> reductions;
for (auto a : axes) {
reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
}
std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
return a.second > b.second;
});
// Extract the two smallest and try to merge them in case the contiguous
// reduction can be bigger than just the last axis.
for (int i = reductions.size() - 1; i >= 1; i--) {
auto a = reductions[i];
auto b = reductions[i - 1];
// If b.stride == a.shape * a.stride, then a and b are contiguous
if (b.second == a.first * a.second) {
reductions.erase(reductions.begin() + i);
reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
}
}
std::vector<int> shape;
std::vector<size_t> strides;
for (auto r : reductions) {
shape.push_back(r.first);
strides.push_back(r.second);
}
// We can call the contiguous reduction op for every weird way the input is
// structured in the rest of the axes.
if (strides.back() == 1) {
return ReductionPlan(GeneralContiguousReduce, shape, strides);
}
// Delegate to the general strided reduction op if the axes after
// strides.back() are contiguous.
if (strides.back() > 1) {
int size = 1;
for (int i = x.ndim() - 1; i >= 0; i--) {
if (axes.back() == i) {
continue;
}
if (x.strides()[i] != size) {
break;
}
size *= x.shape(i);
}
if (size >= strides.back()) {
return ReductionPlan(GeneralStridedReduce, shape, strides);
}
}
return ReductionPlan(GeneralReduce, shape, strides);
}
} // namespace mlx::core

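The merge pass above relies on one invariant: after sorting the (size, stride) pairs by decreasing stride, two reduction axes a and b can be fused exactly when b.stride == a.size * a.stride. A standalone sketch of that pass, using plain std::pair in place of MLX's array machinery:

// Standalone sketch of the axis-merge pass in get_reduction_plan.
// Assumes the (shape, stride) pairs are already sorted by decreasing stride.
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Axes of a row-contiguous {4, 3, 2} array reduced over all dims:
  // strides are {6, 2, 1}.
  std::vector<std::pair<int, size_t>> reductions = {{4, 6}, {3, 2}, {2, 1}};
  for (int i = reductions.size() - 1; i >= 1; i--) {
    auto a = reductions[i];
    auto b = reductions[i - 1];
    if (b.second == a.first * a.second) { // contiguous: fuse into one axis
      reductions.erase(reductions.begin() + i);
      reductions[i - 1] = {a.first * b.first, a.second};
    }
  }
  // Prints a single fused axis: shape 24, stride 1.
  for (auto& r : reductions) {
    std::printf("shape %d stride %zu\n", r.first, r.second);
  }
}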
View File

@@ -234,7 +234,7 @@ void scan_dispatch(
  auto op = [](U* o, const U* y, const T* x) { *o = (*x < *y) ? *y : *x; };
  auto init = (issubdtype(input.dtype(), floating))
      ? static_cast<U>(-std::numeric_limits<float>::infinity())
-      : std::numeric_limits<U>::max();
+      : std::numeric_limits<U>::min();
  auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
  auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
  scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);

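The one-token change above matters because the scan seeds its accumulator with an identity element: for a running max over integer types that must be the smallest representable value; seeding with max() would force every output to max(). A minimal sketch of the same idea outside MLX:

// Sketch: identity element for an inclusive cummax.
#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
  std::vector<int> x = {3, -1, 7, 2};
  int acc = std::numeric_limits<int>::min(); // identity for max
  for (int v : x) {
    acc = std::max(acc, v);
    std::printf("%d ", acc); // prints: 3 3 7 7
  }
  std::printf("\n");
}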
View File

@@ -0,0 +1,52 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/common/utils.h"
namespace mlx::core {
std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
const array& in,
std::vector<int>& start_indices,
std::vector<int>& strides) {
int64_t data_offset = 0;
bool copy_needed = false;
std::vector<int64_t> inp_strides(in.ndim(), 0);
for (int i = 0; i < in.ndim(); ++i) {
data_offset += start_indices[i] * in.strides()[i];
inp_strides[i] = in.strides()[i] * strides[i];
copy_needed |= strides[i] < 0;
}
return std::make_tuple(copy_needed, data_offset, inp_strides);
}
void shared_buffer_slice(
const array& in,
const std::vector<size_t>& out_strides,
size_t data_offset,
array& out) {
// Compute row/col contiguity
auto [data_size, is_row_contiguous, is_col_contiguous] =
check_contiguity(out.shape(), out_strides);
auto flags = in.flags();
flags.row_contiguous = is_row_contiguous;
flags.col_contiguous = is_col_contiguous;
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in.data_size()) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
out.copy_shared_buffer(in, out_strides, flags, data_size, data_offset);
}
} // namespace mlx::core

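prepare_slice reduces a slice to an element offset plus per-axis strides, and only negative steps force a copy. A minimal sketch of that arithmetic, assuming a row-contiguous {2, 3, 4} input (values illustrative):

// Sketch of the offset/stride arithmetic behind prepare_slice.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> in_strides = {12, 4, 1}; // row-contiguous {2, 3, 4}
  std::vector<int> start = {1, 0, 2};
  std::vector<int> step = {1, 2, 1};
  int64_t offset = 0;
  bool copy_needed = false;
  std::vector<int64_t> out_strides(3);
  for (int i = 0; i < 3; ++i) {
    offset += start[i] * in_strides[i];           // element offset into in
    out_strides[i] = in_strides[i] * step[i];     // stride scaled by step
    copy_needed |= step[i] < 0;                   // negative steps need a copy
  }
  // offset = 14, out_strides = {12, 8, 1}, copy_needed = false
  std::printf("offset %lld copy %d\n", (long long)offset, copy_needed);
}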
View File

@@ -0,0 +1,20 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/array.h"
namespace mlx::core {
std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
const array& in,
std::vector<int>& start_indices,
std::vector<int>& strides);
void shared_buffer_slice(
const array& in,
const std::vector<size_t>& out_strides,
size_t data_offset,
array& out);
} // namespace mlx::core

View File

@@ -113,14 +113,14 @@ void sort(const array& in, array& out, int axis) {
  axis = axis < 0 ? axis + in.ndim() : axis;
  size_t n_rows = in.size() / in.shape(axis);
-  auto remaining_shape = in.shape();
+  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);
-  auto remaining_strides = in.strides();
+  auto remaining_strides = out.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);
-  size_t axis_stride = in.strides()[axis];
-  int axis_size = in.shape(axis);
+  size_t axis_stride = out.strides()[axis];
+  int axis_size = out.shape(axis);
  // Perform sorting in place
  for (int i = 0; i < n_rows; i++) {
@@ -143,34 +143,42 @@ void argsort(const array& in, array& out, int axis) {
  axis = axis < 0 ? axis + in.ndim() : axis;
  size_t n_rows = in.size() / in.shape(axis);
-  auto remaining_shape = in.shape();
-  remaining_shape.erase(remaining_shape.begin() + axis);
-  auto remaining_strides = in.strides();
-  remaining_strides.erase(remaining_strides.begin() + axis);
-  size_t axis_stride = in.strides()[axis];
+  auto in_remaining_shape = in.shape();
+  in_remaining_shape.erase(in_remaining_shape.begin() + axis);
+  auto in_remaining_strides = in.strides();
+  in_remaining_strides.erase(in_remaining_strides.begin() + axis);
+  auto out_remaining_shape = out.shape();
+  out_remaining_shape.erase(out_remaining_shape.begin() + axis);
+  auto out_remaining_strides = out.strides();
+  out_remaining_strides.erase(out_remaining_strides.begin() + axis);
+  size_t in_stride = in.strides()[axis];
+  size_t out_stride = out.strides()[axis];
  int axis_size = in.shape(axis);
  // Perform sorting
  for (int i = 0; i < n_rows; i++) {
-    size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
-    const T* data_ptr = in.data<T>() + loc;
-    IdxT* idx_ptr = out.data<IdxT>() + loc;
+    size_t in_loc = elem_to_loc(i, in_remaining_shape, in_remaining_strides);
+    size_t out_loc = elem_to_loc(i, out_remaining_shape, out_remaining_strides);
+    const T* data_ptr = in.data<T>() + in_loc;
+    IdxT* idx_ptr = out.data<IdxT>() + out_loc;
-    StridedIterator st_(idx_ptr, axis_stride, 0);
-    StridedIterator ed_(idx_ptr, axis_stride, axis_size);
+    StridedIterator st_(idx_ptr, out_stride, 0);
+    StridedIterator ed_(idx_ptr, out_stride, axis_size);
    // Initialize with iota
    std::iota(st_, ed_, IdxT(0));
    // Sort according to vals
-    StridedIterator st(idx_ptr, axis_stride, 0);
-    StridedIterator ed(idx_ptr, axis_stride, axis_size);
-    std::stable_sort(st, ed, [data_ptr, axis_stride](IdxT a, IdxT b) {
-      auto v1 = data_ptr[a * axis_stride];
-      auto v2 = data_ptr[b * axis_stride];
+    StridedIterator st(idx_ptr, out_stride, 0);
+    StridedIterator ed(idx_ptr, out_stride, axis_size);
+    std::stable_sort(st, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
+      auto v1 = data_ptr[a * in_stride];
+      auto v2 = data_ptr[b * in_stride];
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }

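The rewrite above splits the single axis_stride into in_stride (for reading values) and out_stride (for writing indices), since in and out no longer share a layout. A condensed sketch of the underlying pattern, argsorting one strided row with the same stable tie-break:

// Sketch: argsort one logical row whose values are strided in memory.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Values of one row live at data[i * in_stride].
  std::vector<float> data = {3.f, 0.f, 1.f, 0.f, 2.f, 0.f};
  size_t in_stride = 2;
  int axis_size = 3;
  std::vector<uint32_t> idx(axis_size);
  std::iota(idx.begin(), idx.end(), 0u); // initialize with iota
  std::stable_sort(idx.begin(), idx.end(), [&](uint32_t a, uint32_t b) {
    auto v1 = data[a * in_stride];
    auto v2 = data[b * in_stride];
    return v1 < v2 || (v1 == v2 && a < b); // stable tie-break, as above
  });
  for (auto i : idx) std::printf("%u ", i); // prints: 1 2 0
  std::printf("\n");
}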
View File

@@ -3,7 +3,6 @@
#include "mlx/allocator.h" #include "mlx/allocator.h"
#include "mlx/backend/common/copy.h" #include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack_helper.h" #include "mlx/backend/common/lapack_helper.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h" #include "mlx/primitives.h"
namespace mlx::core { namespace mlx::core {
@@ -145,12 +144,4 @@ void SVD::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
svd_impl(inputs[0], outputs[0], outputs[1], outputs[2]); svd_impl(inputs[0], outputs[0], outputs[1], outputs[2]);
} }
std::pair<std::vector<array>, std::vector<int>> SVD::vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) {
auto ax = axes[0] >= 0 ? 0 : -1;
auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
return {{linalg::svd(a, stream())}, {ax, ax, ax}};
}
} // namespace mlx::core } // namespace mlx::core

View File

@@ -29,6 +29,15 @@ inline size_t elem_to_loc(int elem, const array& a) {
  return elem_to_loc(elem, a.shape(), a.strides());
}
+template <typename stride_t>
+std::vector<stride_t> make_contiguous_strides(const std::vector<int>& shape) {
+  std::vector<stride_t> strides(shape.size(), 1);
+  for (int i = shape.size() - 1; i > 0; i--) {
+    strides[i - 1] = strides[i] * shape[i];
+  }
+  return strides;
+}
// Collapse dims that are contiguous to possibly route to a better kernel
// e.g. for x = transpose(array({0, 1, 2, 3, 4, 5, 6, 7}, {2, 2, 2}), {2, 0, 1})
// should return {{2, 4}, {{1, 2}}}.

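make_contiguous_strides builds row-major strides as a right-to-left running product, so shape {2, 3, 4} yields {12, 4, 1}. A usage sketch (the template body is the one from the hunk above):

#include <cstddef>
#include <cstdio>
#include <vector>

template <typename stride_t>
std::vector<stride_t> make_contiguous_strides(const std::vector<int>& shape) {
  std::vector<stride_t> strides(shape.size(), 1);
  for (int i = shape.size() - 1; i > 0; i--) {
    strides[i - 1] = strides[i] * shape[i];
  }
  return strides;
}

int main() {
  auto s = make_contiguous_strides<size_t>({2, 3, 4});
  for (auto v : s) std::printf("%zu ", v); // prints: 12 4 1
  std::printf("\n");
}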
View File

@@ -1,32 +1,140 @@
+function(make_jit_source SRC_FILE)
+  # This function takes a metal header file,
+  # runs the C preprocessor on it, and makes
+  # the processed contents available as a string in a C++ function
+  # mlx::core::metal::${SRC_NAME}()
+  #
+  # To use the function, declare it in jit/includes.h and
+  # include jit/includes.h.
+  #
+  # Additional arguments to this function are treated as dependencies
+  # in the CMake build system.
+  get_filename_component(SRC_NAME ${SRC_FILE} NAME)
   add_custom_command(
-    OUTPUT compiled_preamble.cpp
+    OUTPUT jit/${SRC_NAME}.cpp
     COMMAND /bin/bash
     ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
-    ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/jit
     ${CMAKE_C_COMPILER}
     ${PROJECT_SOURCE_DIR}
+    ${SRC_FILE}
+    "-DMLX_METAL_VERSION=${MLX_METAL_VERSION}"
     DEPENDS make_compiled_preamble.sh
-    kernels/compiled_preamble.h
-    kernels/unary.h
-    kernels/binary.h
+    kernels/${SRC_FILE}.h
+    ${ARGN}
   )
-add_custom_target(
-  compiled_preamble
-  DEPENDS compiled_preamble.cpp
-)
-add_dependencies(mlx compiled_preamble)
+  add_custom_target(${SRC_NAME} DEPENDS jit/${SRC_NAME}.cpp)
+  add_dependencies(mlx ${SRC_NAME})
+  target_sources(
+    mlx
+    PRIVATE
+    ${CMAKE_CURRENT_BINARY_DIR}/jit/${SRC_NAME}.cpp
+  )
+endfunction(make_jit_source)
+make_jit_source(
utils
kernels/bf16.h
kernels/complex.h
kernels/defines.h
)
make_jit_source(
unary_ops
kernels/erf.h
kernels/expm1f.h
)
make_jit_source(binary_ops)
make_jit_source(ternary_ops)
make_jit_source(
reduce_utils
kernels/atomic.h
kernels/reduction/ops.h
)
make_jit_source(scatter)
make_jit_source(gather)
make_jit_source(hadamard)
if (MLX_METAL_JIT)
target_sources(
mlx
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/jit_kernels.cpp
)
make_jit_source(arange)
make_jit_source(copy)
make_jit_source(unary)
make_jit_source(binary)
make_jit_source(binary_two)
make_jit_source(
fft
kernels/fft/radix.h
kernels/fft/readwrite.h
)
make_jit_source(ternary)
make_jit_source(softmax)
make_jit_source(scan)
make_jit_source(sort)
make_jit_source(
reduce
kernels/reduction/reduce_all.h
kernels/reduction/reduce_col.h
kernels/reduction/reduce_row.h
)
make_jit_source(
steel/gemm/gemm
kernels/steel/utils.h
kernels/steel/gemm/loader.h
kernels/steel/gemm/mma.h
kernels/steel/gemm/params.h
kernels/steel/gemm/transforms.h
)
make_jit_source(steel/gemm/kernels/steel_gemm_fused)
make_jit_source(
steel/gemm/kernels/steel_gemm_masked
kernels/steel/defines.h
)
make_jit_source(steel/gemm/kernels/steel_gemm_splitk)
make_jit_source(
steel/conv/conv
kernels/steel/utils.h
kernels/steel/defines.h
kernels/steel/gemm/mma.h
kernels/steel/gemm/transforms.h
kernels/steel/conv/params.h
kernels/steel/conv/loader.h
kernels/steel/conv/loaders/loader_channel_l.h
kernels/steel/conv/loaders/loader_channel_n.h
)
make_jit_source(
steel/conv/kernels/steel_conv
)
make_jit_source(
steel/conv/kernels/steel_conv_general
kernels/steel/defines.h
kernels/steel/conv/loaders/loader_general.h
)
make_jit_source(quantized)
make_jit_source(gemv_masked)
else()
target_sources(
mlx
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/nojit_kernels.cpp
)
endif()
 target_sources(
   mlx
   PRIVATE
   ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
@@ -36,10 +144,13 @@ target_sources(
   ${CMAKE_CURRENT_SOURCE_DIR}/normalization.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/rope.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
 )
 if (NOT MLX_METAL_PATH)

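Per the comment at the top of make_jit_source, each preprocessed Metal header ends up as a string returned by a function in mlx::core::metal, declared in jit/includes.h. A hedged sketch of what a generated jit/<name>.cpp might look like; the exact signature and the embedded contents are assumptions here, the real file holds the fully preprocessed header:

// Hypothetical shape of a generated jit/<name>.cpp translation unit.
namespace mlx::core::metal {

const char* utils() {
  // Stand-in for the preprocessed contents of kernels/utils.h.
  return R"(
// ... preprocessed Metal source ...
)";
}

} // namespace mlx::core::metal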
View File

@@ -140,10 +140,15 @@ void BufferCache::remove_from_list(BufferCache::BufferHolder* to_remove) {
 MetalAllocator::MetalAllocator()
     : device_(device(mlx::core::Device::gpu).mtl_device()),
-      buffer_cache_(device_),
-      block_limit_(1.5 * device_->recommendedMaxWorkingSetSize()),
-      gc_limit_(0.95 * device_->recommendedMaxWorkingSetSize()),
-      max_pool_size_(block_limit_) {}
+      buffer_cache_(device_) {
+  auto memsize = std::get<size_t>(device_info()["memory_size"]);
+  block_limit_ =
+      std::min(1.5 * device_->recommendedMaxWorkingSetSize(), 0.95 * memsize);
+  gc_limit_ = std::min(
+      static_cast<size_t>(0.95 * device_->recommendedMaxWorkingSetSize()),
+      block_limit_);
+  max_pool_size_ = block_limit_;
+}
size_t MetalAllocator::set_cache_limit(size_t limit) {
  std::swap(limit, max_pool_size_);
@@ -165,6 +170,15 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
    return Buffer{nullptr};
  }
+  // More helpful message if maximum buffer length is exceeded
+  if (size > device_->maxBufferLength()) {
+    std::ostringstream msg;
+    msg << "Attempting to allocate " << size << " bytes which is greater than"
+        << " the maximum allowed buffer size of " << device_->maxBufferLength()
+        << " bytes.";
+    throw std::runtime_error(msg.str());
+  }
  // Align up memory
  if (size > vm_page_size) {
    size = vm_page_size * ((size + vm_page_size - 1) / vm_page_size);
@@ -209,6 +223,11 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
  return Buffer{static_cast<void*>(buf)};
}
+void MetalAllocator::clear_cache() {
+  std::unique_lock lk(mutex_);
+  buffer_cache_.clear();
+}
void MetalAllocator::free(Buffer buffer) {
  auto buf = static_cast<MTL::Buffer*>(buffer.ptr());
  std::unique_lock lk(mutex_);
@@ -223,8 +242,17 @@ void MetalAllocator::free(Buffer buffer) {
}
MetalAllocator& allocator() {
-  static MetalAllocator allocator_;
-  return allocator_;
+  // By creating the |allocator_| on heap, the destructor of MetalAllocator will
+  // not be called on exit and all the buffers will be leaked. This is necessary
+  // because releasing buffers can take more than 30sec when the program holds a
+  // lot of RAM (for example inferencing a LLM), and it would feel frozen to
+  // users when exiting.
+  // TODO(zcbenz): Consider using the `base::NoDestructor` class from Chromium
+  // when applying this pattern to more places, or when introducing sanitizers
+  // to MLX.
+  // https://source.chromium.org/chromium/chromium/src/+/main:base/no_destructor.h
+  static MetalAllocator* allocator_ = new MetalAllocator;
+  return *allocator_;
}
size_t set_cache_limit(size_t limit) {
@@ -239,9 +267,15 @@ size_t get_active_memory() {
size_t get_peak_memory() {
  return allocator().get_peak_memory();
}
+void reset_peak_memory() {
+  allocator().reset_peak_memory();
+}
size_t get_cache_memory() {
  return allocator().get_cache_memory();
}
+void clear_cache() {
+  return allocator().clear_cache();
+}
} // namespace metal

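Together the new hooks let a caller drop cached Metal buffers and restart peak tracking between workloads. A usage sketch against the wrappers added above; the header path is an assumption:

// Sketch: using the new cache/peak hooks between two benchmark runs.
#include "mlx/backend/metal/metal.h" // assumed declaration site

void between_benchmarks() {
  mlx::core::metal::clear_cache();       // release cached MTL buffers
  mlx::core::metal::reset_peak_memory(); // next peak measures this run only
}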
View File

@@ -26,6 +26,7 @@ class BufferCache {
  size_t cache_size() {
    return pool_size_;
  }
+  void clear();
 private:
  struct BufferHolder {
@@ -37,7 +38,6 @@ class BufferCache {
    MTL::Buffer* buf;
  };
-  void clear();
  void add_at_head(BufferHolder* to_add);
  void remove_from_list(BufferHolder* to_remove);
@@ -62,11 +62,16 @@ class MetalAllocator : public allocator::Allocator {
  size_t get_peak_memory() {
    return peak_memory_;
  };
+  void reset_peak_memory() {
+    std::unique_lock lk(mutex_);
+    peak_memory_ = 0;
+  };
  size_t get_cache_memory() {
    return buffer_cache_.cache_size();
  };
  size_t set_cache_limit(size_t limit);
  size_t set_memory_limit(size_t limit, bool relaxed);
+  void clear_cache();
 private:
  MTL::Device* device_;

View File

@@ -0,0 +1,296 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/common/binary.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#define BINARY_GPU(func) \
void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
binary_op_gpu(inputs, out, get_primitive_string(this)); \
}
#define BINARY_GPU_MULTI(func) \
void func::eval_gpu( \
const std::vector<array>& inputs, std::vector<array>& outputs) { \
binary_op_gpu(inputs, outputs, get_primitive_string(this)); \
}
namespace mlx::core {
constexpr int MAX_BINARY_SPECIALIZED_DIMS = 5;
std::string get_kernel_name(
BinaryOpType bopt,
const std::string& op,
const array& a,
bool use_2d,
int ndim) {
std::ostringstream kname;
switch (bopt) {
case BinaryOpType::ScalarScalar:
kname << "ss";
break;
case BinaryOpType::ScalarVector:
kname << (use_2d ? "sv2" : "sv");
break;
case BinaryOpType::VectorScalar:
kname << (use_2d ? "vs2" : "vs");
break;
case BinaryOpType::VectorVector:
kname << (use_2d ? "vv2" : "vv");
break;
case BinaryOpType::General:
kname << "g";
if (ndim <= MAX_BINARY_SPECIALIZED_DIMS) {
kname << ndim;
} else {
kname << "n";
}
break;
}
kname << op << type_to_name(a);
return kname.str();
}
void binary_op_gpu_inplace(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string& op,
const Stream& s) {
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
auto& out = outputs[0];
if (out.size() == 0) {
return;
}
// Try to collapse contiguous dims
auto [shape, strides] = collapse_contiguous_dims(a, b, out);
auto& strides_a = strides[0];
auto& strides_b = strides[1];
auto& strides_out = strides[2];
bool use_2d = out.data_size() > UINT32_MAX;
std::string kernel_name = get_kernel_name(bopt, op, a, use_2d, shape.size());
auto& d = metal::device(s.device);
auto kernel =
get_binary_two_kernel(d, kernel_name, a.dtype(), outputs[0].dtype(), op);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
// - If a is donated it goes to the first output
// - If b is donated it goes to the first output if a was not donated
// otherwise it goes to the second output
bool donate_a = a.data_shared_ptr() == nullptr;
bool donate_b = b.data_shared_ptr() == nullptr;
compute_encoder.set_input_array(donate_a ? outputs[0] : a, 0);
compute_encoder.set_input_array(
donate_b ? (donate_a ? outputs[1] : outputs[0]) : b, 1);
compute_encoder.set_output_array(outputs[0], 2);
compute_encoder.set_output_array(outputs[1], 3);
if (bopt == BinaryOpType::General) {
auto ndim = shape.size();
if (ndim > 3) {
compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 4);
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 5);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 6);
} else {
// The shape is implicit in the grid for <= 3D
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 4);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 5);
}
if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
compute_encoder->setBytes(&ndim, sizeof(int), 7);
}
// Launch up to 3D grid of threads
size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
size_t rest = out.size() / (dim0 * dim1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size != 1024) {
throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
}
auto group_dims = get_block_dims(dim0, dim1, rest);
MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
compute_encoder.dispatchThreads(grid_dims, group_dims);
} else {
// Launch a 1D or 2D grid of threads
size_t nthreads = out.data_size();
MTL::Size grid_dims = use_2d
? get_2d_grid_dims(outputs[0].shape(), outputs[0].strides())
: MTL::Size(nthreads, 1, 1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size > nthreads) {
thread_group_size = nthreads;
}
MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
}
void binary_op_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string& op,
const Stream& s) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, outputs[0], bopt, true);
set_binary_op_output_data(a, b, outputs[1], bopt, true);
binary_op_gpu_inplace(inputs, outputs, op, s);
}
void binary_op_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string& op) {
auto& s = outputs[0].primitive().stream();
binary_op_gpu(inputs, outputs, op, s);
}
void binary_op_gpu_inplace(
const std::vector<array>& inputs,
array& out,
const std::string& op,
const Stream& s) {
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
if (out.size() == 0) {
return;
}
// Try to collapse contiguous dims
auto [shape, strides] = collapse_contiguous_dims(a, b, out);
auto& strides_a = strides[0];
auto& strides_b = strides[1];
auto& strides_out = strides[2];
bool use_2d = out.data_size() > UINT32_MAX;
std::string kernel_name = get_kernel_name(bopt, op, a, use_2d, shape.size());
auto& d = metal::device(s.device);
auto kernel = get_binary_kernel(d, kernel_name, a.dtype(), out.dtype(), op);
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
bool donate_a = a.data_shared_ptr() == nullptr;
bool donate_b = b.data_shared_ptr() == nullptr;
compute_encoder.set_input_array(donate_a ? out : a, 0);
compute_encoder.set_input_array(donate_b ? out : b, 1);
compute_encoder.set_output_array(out, 2);
if (bopt == BinaryOpType::General) {
auto ndim = shape.size();
if (ndim > 3) {
compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 3);
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 4);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 5);
} else {
// The shape is implicit in the grid for <= 3D
compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 3);
compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 4);
}
if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
compute_encoder->setBytes(&ndim, sizeof(int), 6);
}
// Launch up to 3D grid of threads
size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
size_t rest = out.size() / (dim0 * dim1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size != 1024) {
throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
}
auto group_dims = get_block_dims(dim0, dim1, rest);
MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
compute_encoder.dispatchThreads(grid_dims, group_dims);
} else {
// Launch a 1D or 2D grid of threads
size_t nthreads = out.data_size();
MTL::Size grid_dims = use_2d ? get_2d_grid_dims(out.shape(), out.strides())
: MTL::Size(nthreads, 1, 1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size > nthreads) {
thread_group_size = nthreads;
}
MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
compute_encoder.dispatchThreads(grid_dims, group_dims);
}
}
void binary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string& op,
const Stream& s) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out, bopt, true);
binary_op_gpu_inplace(inputs, out, op, s);
}
void binary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string& op) {
auto& s = out.primitive().stream();
binary_op_gpu(inputs, out, op, s);
}
BINARY_GPU(Add)
BINARY_GPU(ArcTan2)
BINARY_GPU(Divide)
BINARY_GPU_MULTI(DivMod)
BINARY_GPU(Remainder)
BINARY_GPU(Equal)
BINARY_GPU(Greater)
BINARY_GPU(GreaterEqual)
BINARY_GPU(Less)
BINARY_GPU(LessEqual)
BINARY_GPU(LogicalAnd)
BINARY_GPU(LogicalOr)
BINARY_GPU(LogAddExp)
BINARY_GPU(Maximum)
BINARY_GPU(Minimum)
BINARY_GPU(Multiply)
BINARY_GPU(NotEqual)
BINARY_GPU(Power)
BINARY_GPU(Subtract)
void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
switch (op_) {
case BitwiseBinary::And:
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
case BitwiseBinary::Or:
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
case BitwiseBinary::Xor:
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
case BitwiseBinary::LeftShift:
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
case BitwiseBinary::RightShift:
binary_op_gpu(inputs, out, get_primitive_string(this));
break;
}
}
} // namespace mlx::core

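The name scheme in get_kernel_name concatenates an op-type prefix (ss, sv, vs, vv, optionally suffixed with 2 for 2-D launches, or g plus a dimension count), the op string, and a dtype tag from type_to_name. A condensed sketch of just the prefix logic; the enum values mirror the ones above, and the final dtype suffix is left out:

// Sketch of the kernel-name prefix logic from get_kernel_name.
#include <cstdio>
#include <string>

enum class BinaryOpType { ScalarScalar, ScalarVector, VectorScalar,
                          VectorVector, General };

std::string prefix(BinaryOpType bopt, bool use_2d, int ndim, int max_dims) {
  switch (bopt) {
    case BinaryOpType::ScalarScalar: return "ss";
    case BinaryOpType::ScalarVector: return use_2d ? "sv2" : "sv";
    case BinaryOpType::VectorScalar: return use_2d ? "vs2" : "vs";
    case BinaryOpType::VectorVector: return use_2d ? "vv2" : "vv";
    case BinaryOpType::General:
      // Specialized up to max_dims dims, generic "gn" beyond that.
      return ndim <= max_dims ? "g" + std::to_string(ndim) : "gn";
  }
  return "";
}

int main() {
  // e.g. "g3" + op + dtype tag for a general 3-D binary op
  std::printf("%s\n", prefix(BinaryOpType::General, false, 3, 5).c_str());
}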
View File

@@ -0,0 +1,33 @@
// Copyright © 2024 Apple Inc.
#pragma once
#include "mlx/array.h"
namespace mlx::core {
void binary_op_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string& op,
const Stream& s);
void binary_op_gpu(
const std::vector<array>& inputs,
array& out,
const std::string& op,
const Stream& s);
void binary_op_gpu_inplace(
const std::vector<array>& inputs,
std::vector<array>& outputs,
const std::string& op,
const Stream& s);
void binary_op_gpu_inplace(
const std::vector<array>& inputs,
array& out,
const std::string& op,
const Stream& s);
} // namespace mlx::core

View File

@@ -4,8 +4,8 @@
#include "mlx/backend/common/compiled.h" #include "mlx/backend/common/compiled.h"
#include "mlx/backend/common/utils.h" #include "mlx/backend/common/utils.h"
#include "mlx/backend/metal/compiled_preamble.h"
#include "mlx/backend/metal/device.h" #include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/utils.h" #include "mlx/backend/metal/utils.h"
#include "mlx/graph_utils.h" #include "mlx/graph_utils.h"
#include "mlx/primitives.h" #include "mlx/primitives.h"
@@ -56,12 +56,15 @@ inline void build_kernel(
} else { } else {
add_indices = true; add_indices = true;
os << " device const " << get_type_string(x.dtype()) << "* " << xname os << " device const " << get_type_string(x.dtype()) << "* " << xname
<< " [[buffer(" << cnt++ << ")]]," << std::endl << " [[buffer(" << cnt++ << ")]]," << std::endl;
<< " constant const size_t* " << xname << "_strides [[buffer("
<< cnt++ << ")]]," << std::endl;
} }
} }
if (add_indices) {
os << " constant const size_t* in_strides [[buffer(" << cnt++
<< ")]],\n";
}
// Add the output arguments // Add the output arguments
for (auto& x : outputs) { for (auto& x : outputs) {
os << " device " << get_type_string(x.dtype()) << "* " os << " device " << get_type_string(x.dtype()) << "* "
@@ -110,13 +113,17 @@ inline void build_kernel(
} }
// Read the inputs in tmps // Read the inputs in tmps
for (auto& x : inputs) { int nc_in_count = 0;
for (int i = 0; i < inputs.size(); ++i) {
auto& x = inputs[i];
auto& xname = namer.get_name(x); auto& xname = namer.get_name(x);
if (is_constant(x)) { if (is_constant(x)) {
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = "; auto type_str = get_type_string(x.dtype());
os << " auto tmp_" << xname << " = static_cast<"
<< get_type_string(x.dtype()) << ">(";
print_constant(os, x); print_constant(os, x);
os << ";" << std::endl; os << ");" << std::endl;
} else if (is_scalar(x)) { } else if (is_scalar(x)) {
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = " os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
<< xname << "[0];" << std::endl; << xname << "[0];" << std::endl;
@@ -124,17 +131,20 @@ inline void build_kernel(
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = " os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
<< xname << "[index];" << std::endl; << xname << "[index];" << std::endl;
} else if (!dynamic_dims) { } else if (!dynamic_dims) {
int offset = nc_in_count * ndim;
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = " os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
<< xname << "["; << xname << "[";
os << "index_0 * " << xname << "_strides[0]"; os << "index_0 * " << "in_strides[" << offset << "]";
for (int i = 1; i < ndim; i++) { for (int i = 1; i < ndim; i++) {
os << " + index_" << i << " * " << xname << "_strides[" << i << "]"; os << " + index_" << i << " * " << "in_strides[" << offset + i << "]";
} }
os << "];" << std::endl; os << "];" << std::endl;
nc_in_count++;
} else { } else {
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = " os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
<< xname << "[elem_to_loc(index, output_shape, " << xname << xname << "[elem_to_loc(index, output_shape, in_strides + "
<< "_strides, ndim)];" << std::endl; << nc_in_count * ndim << ", ndim)];" << std::endl;
nc_in_count++;
} }
} }
@@ -190,7 +200,8 @@ void Compiled::eval_gpu(
// If not we have to build it ourselves // If not we have to build it ourselves
if (lib == nullptr) { if (lib == nullptr) {
std::ostringstream kernel; std::ostringstream kernel;
kernel << metal::get_kernel_preamble() << std::endl; kernel << metal::utils() << metal::unary_ops() << metal::binary_ops()
<< metal::ternary_ops();
build_kernel( build_kernel(
kernel, kernel,
kernel_lib_ + "_contiguous", kernel_lib_ + "_contiguous",
@@ -295,6 +306,7 @@ void Compiled::eval_gpu(
// Put the inputs in // Put the inputs in
int cnt = 0; int cnt = 0;
int stride_idx = 1; // idx 0 is the output strides int stride_idx = 1; // idx 0 is the output strides
std::vector<size_t> in_strides;
for (int i = 0; i < inputs.size(); i++) { for (int i = 0; i < inputs.size(); i++) {
if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) { if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
continue; continue;
@@ -302,13 +314,17 @@ void Compiled::eval_gpu(
auto& x = inputs[i]; auto& x = inputs[i];
compute_encoder.set_input_array(x, cnt++); compute_encoder.set_input_array(x, cnt++);
if (!contiguous && !is_scalar(x)) { if (!contiguous && !is_scalar(x)) {
compute_encoder->setBytes( in_strides.insert(
strides[stride_idx].data(), in_strides.end(),
strides[stride_idx].size() * sizeof(size_t), strides[stride_idx].begin(),
cnt++); strides[stride_idx].end());
stride_idx++; stride_idx++;
} }
} }
if (!in_strides.empty()) {
compute_encoder->setBytes(
in_strides.data(), in_strides.size() * sizeof(size_t), cnt++);
}
compiled_allocate_outputs( compiled_allocate_outputs(
inputs, outputs, inputs_, constant_ids_, contiguous, true); inputs, outputs, inputs_, constant_ids_, contiguous, true);
@@ -336,7 +352,7 @@ void Compiled::eval_gpu(
MTL::Size grid_dims(nthreads, 1, 1); MTL::Size grid_dims(nthreads, 1, 1);
MTL::Size group_dims( MTL::Size group_dims(
std::min(nthreads, kernel->maxTotalThreadsPerThreadgroup()), 1, 1); std::min(nthreads, kernel->maxTotalThreadsPerThreadgroup()), 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims); compute_encoder.dispatchThreads(grid_dims, group_dims);
} else { } else {
size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1; size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1; size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
@@ -347,7 +363,7 @@ void Compiled::eval_gpu(
} }
auto group_dims = get_block_dims(dim0, dim1, rest); auto group_dims = get_block_dims(dim0, dim1, rest);
MTL::Size grid_dims = MTL::Size(dim0, dim1, rest); MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
compute_encoder->dispatchThreads(grid_dims, group_dims); compute_encoder.dispatchThreads(grid_dims, group_dims);
} }
} }

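The net effect of the compiled-kernel change is a single packed strides buffer instead of one buffer per non-contiguous input: input k's strides occupy in_strides[k*ndim .. k*ndim + ndim - 1], which is exactly what the generated in_strides[offset + i] reads index. A sketch of the packing and the matching lookup (shapes illustrative):

// Sketch: packing per-input strides into one flat vector, as eval_gpu
// now does, then reading input k's stride for dim d back out.
#include <cstdio>
#include <vector>

int main() {
  std::vector<std::vector<size_t>> strides = {{12, 4, 1}, {0, 4, 1}};
  int ndim = 3;
  std::vector<size_t> in_strides;
  for (auto& s : strides) {
    in_strides.insert(in_strides.end(), s.begin(), s.end());
  }
  int k = 1, d = 1; // second input, second dim
  std::printf("%zu\n", in_strides[k * ndim + d]); // prints: 4
}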
View File

@@ -1,9 +0,0 @@
// Copyright © 2023-24 Apple Inc.
#pragma once
namespace mlx::core::metal {
const char* get_kernel_preamble();
}

View File

@@ -7,6 +7,7 @@
#include "mlx/backend/metal/copy.h" #include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h" #include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/kernels/defines.h" #include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/steel/conv/params.h" #include "mlx/backend/metal/kernels/steel/conv/params.h"
#include "mlx/backend/metal/matmul.h" #include "mlx/backend/metal/matmul.h"
@@ -59,7 +60,7 @@ void explicit_gemm_conv_ND_gpu(
MTL::Size grid_dims = MTL::Size( MTL::Size grid_dims = MTL::Size(
conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]); conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]);
compute_encoder->dispatchThreads(grid_dims, group_dims); compute_encoder.dispatchThreads(grid_dims, group_dims);
// Reshape weight // Reshape weight
std::vector<int> wt_reshape{implicit_K, implicit_N}; std::vector<int> wt_reshape{implicit_K, implicit_N};
@@ -89,6 +90,90 @@ void explicit_gemm_conv_ND_gpu(
/*copies = */ copies); /*copies = */ copies);
} }
template <int N>
void explicit_gemm_conv_group_ND_gpu(
const Stream& s,
metal::Device& d,
const array& in,
const array& wt,
array out,
const MLXConvParams<N>& conv_params) {
const int groups = conv_params.groups;
const int C_per_group = conv_params.C / conv_params.groups;
const int O_per_group = conv_params.O / conv_params.groups;
// Get gemm shapes
const int implicit_M = out.size() / conv_params.O;
const int implicit_K = wt.size() / conv_params.O;
const int implicit_N = O_per_group;
int kernel_size = 1;
for (int i = 0; i < N; ++i) {
kernel_size *= conv_params.wS[i];
}
// Prepare unfolding array
std::vector<int> unfolded_shape{implicit_M, implicit_K * groups};
array in_unfolded(unfolded_shape, in.dtype(), nullptr, {});
in_unfolded.set_data(allocator::malloc_or_wait(in_unfolded.nbytes()));
// Prepare unfolding kernel
std::ostringstream kname;
kname << "naive_unfold_transpose_nd_" << type_to_name(in_unfolded) << "_"
<< N;
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
compute_encoder->setComputePipelineState(kernel);
compute_encoder.set_input_array(in, 0);
compute_encoder.set_output_array(in_unfolded, 1);
compute_encoder->setBytes(&conv_params, sizeof(conv_params), 2);
// Launch unfolding kernel
int tgp_x = std::min(conv_params.C, 64);
tgp_x = 32 * ((tgp_x + 32 - 1) / 32);
int tgp_y = 256 / tgp_x;
MTL::Size group_dims = MTL::Size(tgp_x, tgp_y, 1);
MTL::Size grid_dims = MTL::Size(
conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]);
compute_encoder.dispatchThreads(grid_dims, group_dims);
// Transpose kernel weights so that we can slice them by contiguous chunks
// of channel groups.
array wt_view(
{wt.shape(0), C_per_group, kernel_size}, wt.dtype(), nullptr, {});
wt_view.copy_shared_buffer(
wt,
{wt.strides(0), 1, static_cast<size_t>(C_per_group)},
wt.flags(),
wt.size());
// Materialize
auto wt_transpose = array(wt_view.shape(), wt_view.dtype(), nullptr, {});
copy_gpu(wt_view, wt_transpose, CopyType::General, s);
// Perform gemm
std::vector<array> copies = {in_unfolded, wt_view, wt_transpose};
return steel_matmul_conv_groups(
s,
d,
/*a = */ in_unfolded,
/*b = */ wt_transpose,
/*c = */ out,
/*M = */ implicit_M,
/*N = */ implicit_N,
/*K = */ implicit_K,
/*a_cols = */ implicit_K * groups,
/*b_cols = */ implicit_K,
/*out_cols = */ implicit_N * groups,
/*a_transposed = */ false,
/*b_transposed = */ true,
/* groups = */ groups,
/*copies = */ copies);
}
 void conv_1D_gpu(
    const Stream& s,
    metal::Device& d,
@@ -99,6 +184,7 @@ void conv_1D_gpu(
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
+    int groups,
    bool flip) {
  // Make conv params
  MLXConvParams<1> conv_params{
@@ -118,12 +204,16 @@ void conv_1D_gpu(
      {wt.strides()[0], wt.strides()[1], wt.strides()[2]},
      /* const size_t out_strides[NDIM + 2] = */
      {out.strides()[0], out.strides()[1], out.strides()[2]},
-      /* const int groups = */ 1,
+      /* const int groups = */ groups,
      /* const bool flip = */ flip};
  // Direct to explicit gemm conv
+  if (groups > 1) {
+    return explicit_gemm_conv_group_ND_gpu(s, d, in, wt, out, conv_params);
+  } else {
    return explicit_gemm_conv_ND_gpu(s, d, in, wt, out, conv_params);
  }
+}
void slow_conv_2D_gpu(
    const Stream& s,
@@ -158,7 +248,7 @@ void slow_conv_2D_gpu(
  compute_encoder.set_output_array(out, 2);
  compute_encoder->setBytes(&conv_params, sizeof(MLXConvParams<2>), 3);
-  compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
+  compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
}
void implicit_gemm_conv_2D_gpu(
@@ -168,15 +258,19 @@ void implicit_gemm_conv_2D_gpu(
    const array& wt,
    array out,
    const MLXConvParams<2>& conv_params) {
+  const int groups = conv_params.groups;
+  const int C_per_group = conv_params.C / conv_params.groups;
+  const int O_per_group = conv_params.O / conv_params.groups;
  // Deduce implicit gemm size
-  int implicit_M = conv_params.N * conv_params.oS[0] * conv_params.oS[1];
-  int implicit_N = conv_params.O;
-  int implicit_K = conv_params.wS[0] * conv_params.wS[1] * conv_params.C;
+  const int implicit_M = conv_params.N * conv_params.oS[0] * conv_params.oS[1];
+  const int implicit_N = O_per_group;
+  const int implicit_K = conv_params.wS[0] * conv_params.wS[1] * C_per_group;
  // Determine block and warp tiles
  int wm = 2, wn = 2;
-  int bm = implicit_M >= 8192 && conv_params.C >= 64 ? 64 : 32;
+  int bm = implicit_M >= 8192 && C_per_group >= 64 ? 64 : 32;
  int bn = (bm == 64 || implicit_N >= 64) ? 64 : 32;
  int bk = 16;
@@ -192,15 +286,15 @@ void implicit_gemm_conv_2D_gpu(
  // Fix small channel specialization
  int n_channel_specialization = 0;
-  int channel_k_iters = ((conv_params.C + bk - 1) / bk);
+  int channel_k_iters = ((C_per_group + bk - 1) / bk);
  int gemm_k_iters = conv_params.wS[0] * conv_params.wS[1] * channel_k_iters;
-  if (conv_params.C <= 2) {
+  if (C_per_group <= 2) {
    gemm_k_iters = (implicit_K + bk - 1) / bk;
-    n_channel_specialization = conv_params.C;
-  } else if (conv_params.C <= 4) {
+    n_channel_specialization = C_per_group;
+  } else if (C_per_group <= 4) {
    gemm_k_iters = ((conv_params.wS[0] * conv_params.wS[1] * 4) + bk - 1) / bk;
-    n_channel_specialization = conv_params.C;
+    n_channel_specialization = C_per_group;
  }
  bool small_filter = (!n_channel_specialization) &&
@@ -242,7 +336,17 @@ void implicit_gemm_conv_2D_gpu(
  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
-  auto kernel = d.get_kernel(kname.str());
+  auto kernel = get_steel_conv_kernel(
+      d,
+      kname.str(),
+      out,
+      bm,
+      bn,
+      bk,
+      wm,
+      wn,
+      n_channel_specialization,
+      small_filter);
  compute_encoder->setComputePipelineState(kernel);
  // Deduce grid launch dimensions
@@ -251,7 +355,7 @@ void implicit_gemm_conv_2D_gpu(
  size_t grid_dim_x = tn * tile;
  MTL::Size group_dims = MTL::Size(32, wn, wm);
-  MTL::Size grid_dims = MTL::Size(grid_dim_x, grid_dim_y, 1);
+  MTL::Size grid_dims = MTL::Size(grid_dim_x, grid_dim_y, groups);
  // Encode arrays
  compute_encoder.set_input_array(in, 0);
@@ -263,7 +367,7 @@ void implicit_gemm_conv_2D_gpu(
  compute_encoder->setBytes(&gemm_params, sizeof(ImplicitGemmConv2DParams), 4);
  // Launch kernel
-  compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
+  compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
}
void implicit_gemm_conv_2D_general_gpu(
@@ -395,7 +499,8 @@ void implicit_gemm_conv_2D_general_gpu(
  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
-  auto kernel = d.get_kernel(kname.str());
+  auto kernel =
+      get_steel_conv_general_kernel(d, kname.str(), out, bm, bn, bk, wm, wn);
  compute_encoder->setComputePipelineState(kernel);
  // Deduce grid launch dimensions
@@ -423,7 +528,7 @@ void implicit_gemm_conv_2D_general_gpu(
      base_w.data(), sizeof(Conv2DGeneralBaseInfo) * base_w.size(), 7);
  // Launch kernel
-  compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
+  compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
}
void winograd_conv_2D_gpu(
@@ -524,7 +629,7 @@ void winograd_conv_2D_gpu(
    MTL::Size group_dims = MTL::Size(32, bo, 1);
    MTL::Size grid_dims = MTL::Size(O_c / bo, 1, 1);
-    compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
+    compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
  }
  // Do input transform
@@ -552,7 +657,7 @@ void winograd_conv_2D_gpu(
    MTL::Size group_dims = MTL::Size(32, wn, wm);
    MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);
-    compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
+    compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
  }
  // Do batched gemm
@@ -600,7 +705,7 @@ void winograd_conv_2D_gpu(
    MTL::Size group_dims = MTL::Size(32, wn, wm);
    MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);
-    compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
+    compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
  }
}
@@ -614,6 +719,7 @@ void conv_2D_gpu(
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
+    const int groups,
    bool flip,
    std::vector<array>& copies) {
  // Make conv params
@@ -629,12 +735,12 @@ void conv_2D_gpu(
      /* const int kdil[NDIM] = */ {wt_dilation[0], wt_dilation[1]},
      /* const int idil[NDIM] = */ {in_dilation[0], in_dilation[1]},
      /* const size_t in_strides[NDIM + 2] = */
-      {in.strides()[0], in.strides()[1], in.strides()[2], in.strides()[3]},
+      {in.strides(0), in.strides(1), in.strides(2), in.strides(3)},
      /* const size_t wt_strides[NDIM + 2] = */
-      {wt.strides()[0], wt.strides()[1], wt.strides()[2], wt.strides()[3]},
+      {wt.strides(0), wt.strides(1), wt.strides(2), wt.strides(3)},
      /* const size_t out_strides[NDIM + 2] = */
-      {out.strides()[0], out.strides()[1], out.strides()[2], out.strides()[3]},
+      {out.strides(0), out.strides(1), out.strides(2), out.strides(3)},
-      /* const int groups = */ 1,
+      /* const int groups = */ groups,
      /* const bool flip = */ flip,
  };
@@ -646,6 +752,18 @@ void conv_2D_gpu(
  bool channels_large = (conv_params.C + conv_params.O) >= 512;
  bool channels_med = (conv_params.C + conv_params.O) >= 256;
+  if (groups > 1) {
+    const int C_per_group = conv_params.C / groups;
+    const int O_per_group = conv_params.O / groups;
+    if (is_idil_one && (C_per_group <= 4 || C_per_group % 16 == 0) &&
+        (O_per_group <= 16 || O_per_group % 16 == 0)) {
+      return implicit_gemm_conv_2D_gpu(s, d, in, wt, out, conv_params);
+    } else {
+      return explicit_gemm_conv_group_ND_gpu(s, d, in, wt, out, conv_params);
+    }
+  }
  // Direct to winograd conv
  if (!flip && is_stride_one && is_kdil_one && is_idil_one &&
      conv_params.wS[0] == 3 && conv_params.wS[1] == 3 &&
@@ -670,6 +788,56 @@ void conv_2D_gpu(
  }
}
void conv_3D_gpu(
const Stream& s,
metal::Device& d,
const array& in,
const array& wt,
array out,
const std::vector<int>& padding,
const std::vector<int>& wt_strides,
const std::vector<int>& wt_dilation,
const std::vector<int>& in_dilation,
bool flip,
std::vector<array>& copies) {
// Make conv params
MLXConvParams<3> conv_params{
/* const int N = */ in.shape(0),
/* const int C = */ in.shape(4),
/* const int O = */ wt.shape(0),
/* const int iS[NDIM] = */ {in.shape(1), in.shape(2), in.shape(3)},
/* const int wS[NDIM] = */ {wt.shape(1), wt.shape(2), wt.shape(3)},
/* const int oS[NDIM] = */ {out.shape(1), out.shape(2), out.shape(3)},
/* const int str[NDIM] = */ {wt_strides[0], wt_strides[1], wt_strides[2]},
/* const int pad[NDIM] = */ {padding[0], padding[1], padding[2]},
/* const int kdil[NDIM] = */
{wt_dilation[0], wt_dilation[1], wt_dilation[2]},
/* const int idil[NDIM] = */
{in_dilation[0], in_dilation[1], in_dilation[2]},
/* const size_t in_strides[NDIM + 2] = */
{in.strides()[0],
in.strides()[1],
in.strides()[2],
in.strides()[3],
in.strides()[4]},
/* const size_t wt_strides[NDIM + 2] = */
{wt.strides()[0],
wt.strides()[1],
wt.strides()[2],
wt.strides()[3],
wt.strides()[4]},
/* const size_t out_strides[NDIM + 2] = */
{out.strides()[0],
out.strides()[1],
out.strides()[2],
out.strides()[3],
out.strides()[4]},
/* const int groups = */ 1,
/* const bool flip = */ flip,
};
return explicit_gemm_conv_ND_gpu(s, d, in, wt, out, conv_params);
}
} // namespace
void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -694,8 +862,23 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
    wt = arr_copy;
  }
+  // 3D conv
+  if (out.ndim() == 5) {
+    conv_3D_gpu(
+        s,
+        d,
+        in,
+        wt,
+        out,
+        padding_,
+        kernel_strides_,
+        kernel_dilation_,
+        input_dilation_,
+        flip_,
+        copies);
+  }
  // 2D conv
-  if (out.ndim() == 4) {
+  else if (out.ndim() == 4) {
    conv_2D_gpu(
        s,
        d,
@@ -706,6 +889,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
+        groups_,
        flip_,
        copies);
  }
@@ -721,6 +905,7 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
+        groups_,
        flip_);
  }
  // Throw error

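For grouped convolution the implicit gemm shrinks along channels: each group sees C/groups input channels and O/groups output channels, so implicit_N and implicit_K scale down while the launch grid gains a groups-sized z dimension. A sketch of that bookkeeping (sizes illustrative; output spatial size assumed equal to input for simplicity):

// Sketch: implicit-gemm sizes for a grouped 2-D conv, mirroring the
// C_per_group / O_per_group arithmetic in the hunks above.
#include <cstdio>

int main() {
  int N = 8, H = 16, W = 16, C = 64, O = 128, kH = 3, kW = 3, groups = 4;
  int C_per_group = C / groups; // 16
  int O_per_group = O / groups; // 32
  int implicit_M = N * H * W;             // output pixels
  int implicit_N = O_per_group;           // 32, down from 128
  int implicit_K = kH * kW * C_per_group; // 144, down from 576
  std::printf("M=%d N=%d K=%d, grid z=%d\n",
              implicit_M, implicit_N, implicit_K, groups);
}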
View File

@@ -4,12 +4,14 @@
#include "mlx/backend/metal/copy.h" #include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h" #include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels/defines.h" #include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h" #include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h" #include "mlx/primitives.h"
namespace mlx::core { namespace mlx::core {
constexpr int MAX_COPY_SPECIALIZED_DIMS = 5;
void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) { void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
if (ctype == CopyType::Vector) { if (ctype == CopyType::Vector) {
// If the input is donateable, we are doing a vector copy and the types // If the input is donateable, we are doing a vector copy and the types
@@ -31,9 +33,6 @@ void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
} else { } else {
out.set_data(allocator::malloc_or_wait(out.nbytes())); out.set_data(allocator::malloc_or_wait(out.nbytes()));
} }
if (out.size() == 0) {
return;
}
 if (ctype == CopyType::GeneralGeneral) {
   ctype = CopyType::General;
 }
@@ -55,34 +54,46 @@ void copy_gpu_inplace(
     int64_t out_offset,
     CopyType ctype,
     const Stream& s) {
+  if (out.size() == 0) {
+    return;
+  }
   // Try to collapse contiguous dims
   auto [shape, strides] = collapse_contiguous_dims(
       data_shape, std::vector{strides_in_pre, strides_out_pre});
   auto& strides_in_ = strides[0];
   auto& strides_out_ = strides[1];
+  bool use_2d = out.data_size() > UINT32_MAX;
   auto& d = metal::device(s.device);
+  std::string kernel_name;
+  {
   std::ostringstream kname;
   switch (ctype) {
     case CopyType::Scalar:
-      kname << "scopy";
+      kname << (use_2d ? "s2" : "s");
       break;
     case CopyType::Vector:
-      kname << "vcopy";
+      kname << (use_2d ? "v2" : "v");
       break;
     case CopyType::General:
-      kname << "gcopy";
+      kname << "g";
       break;
     case CopyType::GeneralGeneral:
-      kname << "ggcopy";
+      kname << "gg";
       break;
   }
-  kname << type_to_name(in) << type_to_name(out);
   if ((ctype == CopyType::General || ctype == CopyType::GeneralGeneral) &&
       shape.size() <= MAX_COPY_SPECIALIZED_DIMS) {
-    kname << "_" << shape.size();
+    kname << shape.size();
   }
-  auto kernel = d.get_kernel(kname.str());
+  kname << "_copy";
+  kname << type_to_name(in) << type_to_name(out);
+  kernel_name = kname.str();
+  }
+  auto kernel = get_copy_kernel(d, kernel_name, in, out);
   auto& compute_encoder = d.get_command_encoder(s.index);
   compute_encoder->setComputePipelineState(kernel);
   bool donate_in = in.data_shared_ptr() == nullptr;
@@ -106,7 +117,7 @@ void copy_gpu_inplace(
     set_vector_bytes(compute_encoder, strides_out, ndim, 4);
   }
-  if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
+  if (ndim > MAX_COPY_SPECIALIZED_DIMS) {
     compute_encoder->setBytes(&ndim, sizeof(int), 5);
   }
@@ -126,16 +137,17 @@ void copy_gpu_inplace(
     auto group_dims = get_block_dims(dim0, dim1, rest);
     MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
-    compute_encoder->dispatchThreads(grid_dims, group_dims);
+    compute_encoder.dispatchThreads(grid_dims, group_dims);
   } else {
     size_t nthreads = out.data_size();
-    MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
+    MTL::Size grid_dims = use_2d ? get_2d_grid_dims(out.shape(), out.strides())
+                                 : MTL::Size(nthreads, 1, 1);
     NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
     if (thread_group_size > nthreads) {
       thread_group_size = nthreads;
     }
     MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
-    compute_encoder->dispatchThreads(grid_dims, group_dims);
+    compute_encoder.dispatchThreads(grid_dims, group_dims);
   }
 }
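
For orientation, copy kernels are now named compositionally: a short op code ("s", "v", "g", or "gg", with a "2" suffix when out.data_size() exceeds UINT32_MAX and dispatch falls back to a 2D grid), an optional dimension specialization for the general cases, the "_copy" tag, and the input/output dtype names. A minimal standalone sketch of the scheme (hypothetical helper, not MLX code; shown for the scalar case):

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

// Hypothetical re-creation of the copy-kernel naming scheme; the real
// names are built inside copy_gpu_inplace() as shown in the diff above.
std::string copy_kernel_name(uint64_t data_size, const std::string& in_t,
                             const std::string& out_t) {
  bool use_2d = data_size > UINT32_MAX;  // large arrays use a 2D grid
  std::ostringstream kname;
  kname << (use_2d ? "s2" : "s");        // CopyType::Scalar case
  kname << "_copy" << in_t << out_t;
  return kname.str();
}

int main() {
  std::cout << copy_kernel_name(1ull << 33, "float32", "float32") << "\n";
  // prints: s2_copyfloat32float32
}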


@@ -5,6 +5,8 @@
 #include <filesystem>
 #include <sstream>
+#include <sys/sysctl.h>

 #define NS_PRIVATE_IMPLEMENTATION
 #define CA_PRIVATE_IMPLEMENTATION
 #define MTL_PRIVATE_IMPLEMENTATION
@@ -12,7 +14,6 @@
 #include "mlx/backend/metal/device.h"
 #include "mlx/backend/metal/metal.h"
 #include "mlx/backend/metal/metal_impl.h"
-#include "mlx/backend/metal/mps/gemm.h"
 #include "mlx/backend/metal/utils.h"

 namespace fs = std::filesystem;
@@ -23,9 +24,34 @@ namespace {
 // TODO nicer way to set this or possibly expose as an environment variable
 constexpr int MAX_BUFFERS_PER_QUEUE = 12;
+constexpr int MAX_DISPATCHES_PER_ENCODER = 2;

 constexpr const char* default_mtllib_path = METAL_PATH;

+constexpr auto get_metal_version() {
+#if (MLX_METAL_VERSION >= 320)
+  return MTL::LanguageVersion3_2;
+#elif (MLX_METAL_VERSION >= 310)
+  return MTL::LanguageVersion3_1;
+#else
+  return MTL::LanguageVersion3_0;
+#endif
+}
+
+std::string get_colocated_mtllib_path(const std::string& lib_name) {
+  Dl_info info;
+  std::string mtllib_path;
+  std::string lib_ext = lib_name + ".metallib";
+
+  int success = dladdr((void*)get_colocated_mtllib_path, &info);
+  if (success) {
+    auto mtllib = fs::path(info.dli_fname).remove_filename() / lib_ext;
+    mtllib_path = mtllib.c_str();
+  }
+
+  return mtllib_path;
+}

 auto load_device() {
   auto devices = MTL::CopyAllDevices();
   auto device = static_cast<MTL::Device*>(devices->object(0))
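
The get_colocated_mtllib_path helper moved out of the header (see the device.h diff below) into this translation unit. It relies on dladdr resolving an address inside the library back to the file that contains it, then looks for the ".metallib" next to that file. A minimal sketch of the mechanism, assuming POSIX dladdr (the anchor function and the "mlx.metallib" name are stand-ins):

#include <dlfcn.h>
#include <filesystem>
#include <iostream>

static void anchor() {}  // any address inside this binary works as a probe

int main() {
  Dl_info info;
  if (dladdr(reinterpret_cast<void*>(&anchor), &info)) {
    // info.dli_fname is the path of the image containing `anchor`;
    // the metallib is expected to sit alongside it.
    auto dir = std::filesystem::path(info.dli_fname).remove_filename();
    std::cout << (dir / "mlx.metallib") << "\n";
  }
}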
@@ -35,7 +61,6 @@ auto load_device() {
   }
   return device;
 }

 std::pair<MTL::Library*, NS::Error*> load_library_from_path(
     MTL::Device* device,
     const char* path) {
@@ -114,6 +139,76 @@ MTL::Library* load_library(
 } // namespace

+CommandEncoder::CommandEncoder(MTL::CommandBuffer* cbuf) : cbuf(cbuf) {
+  enc = cbuf->computeCommandEncoder(MTL::DispatchTypeConcurrent);
+  enc->retain();
+}
+
+CommandEncoder::~CommandEncoder() {
+  enc->endEncoding();
+  enc->release();
+}
+
+void CommandEncoder::set_input_array(
+    const array& a,
+    int idx,
+    int64_t offset /* = 0 */) {
+  auto r_buf = static_cast<MTL::Resource*>(const_cast<void*>(a.buffer().ptr()));
+  if (auto it = outputs.find(r_buf); it != outputs.end()) {
+    // Insert a barrier
+    enc->memoryBarrier(&r_buf, 1);
+    // Remove the output
+    outputs.erase(it);
+  }
+  auto a_buf = static_cast<const MTL::Buffer*>(a.buffer().ptr());
+  auto base_offset = a.data<char>() -
+      static_cast<char*>(const_cast<MTL::Buffer*>(a_buf)->contents());
+  base_offset += offset;
+  enc->setBuffer(a_buf, base_offset, idx);
+}
+
+void CommandEncoder::set_output_array(
+    array& a,
+    int idx,
+    int64_t offset /* = 0 */) {
+  // Add barriers before adding the output to the output set
+  set_input_array(a, idx, offset);
+  auto buf = static_cast<MTL::Resource*>(a.buffer().ptr());
+  if (concurrent) {
+    concurrent_outputs.insert(buf);
+  } else {
+    outputs.insert(buf);
+  }
+}
+
+void CommandEncoder::dispatchThreadgroups(
+    MTL::Size grid_dims,
+    MTL::Size group_dims) {
+  num_dispatches++;
+  enc->dispatchThreadgroups(grid_dims, group_dims);
+  maybe_split();
+}
+
+void CommandEncoder::dispatchThreads(
+    MTL::Size grid_dims,
+    MTL::Size group_dims) {
+  num_dispatches++;
+  enc->dispatchThreads(grid_dims, group_dims);
+  maybe_split();
+}
+
+void CommandEncoder::maybe_split() {
+  if (num_dispatches > MAX_DISPATCHES_PER_ENCODER && !concurrent) {
+    enc->endEncoding();
+    enc->release();
+    num_dispatches = 0;
+    outputs.clear();
+    enc = cbuf->computeCommandEncoder(MTL::DispatchTypeConcurrent);
+    enc->retain();
+  }
+}

 Device::Device() {
   auto pool = new_scoped_memory_pool();
   device_ = load_device();
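
The encoder now manages its own lifetime: encoding starts in the constructor and ends in the destructor, a read of a buffer that an earlier dispatch wrote gets a memory barrier via the outputs set, and maybe_split() tears the encoder down and re-creates it after MAX_DISPATCHES_PER_ENCODER dispatches outside a concurrent context. A toy model of the read-after-write bookkeeping (plain ints stand in for MTL::Resource* handles; not MLX code):

#include <iostream>
#include <unordered_set>

// Toy model of the encoder's read-after-write tracking: buffers written by
// earlier dispatches are remembered, and a barrier is issued the first time
// a later dispatch reads one of them.
struct Tracker {
  std::unordered_set<int> outputs;  // stand-in for MTL::Resource* handles
  void write(int buf) { outputs.insert(buf); }
  void read(int buf) {
    if (outputs.erase(buf)) {
      std::cout << "barrier before reading buffer " << buf << "\n";
    }
  }
};

int main() {
  Tracker t;
  t.write(7);  // kernel A writes buffer 7
  t.read(7);   // kernel B reads it -> barrier
  t.read(7);   // already synchronized -> no barrier
}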
@@ -128,9 +223,6 @@ Device::~Device() {
   for (auto& b : buffer_map_) {
     b.second.second->release();
   }
-  for (auto& e : encoder_map_) {
-    e.second->release();
-  }
   for (auto& k : kernel_map_) {
     k.second->release();
   }
@@ -167,10 +259,7 @@ void Device::increment_command_buffer_ops(int index) {
 MTL::CommandBuffer* Device::get_command_buffer(int index) {
   auto bit = buffer_map_.find(index);
-  return (bit == buffer_map_.end()) ? nullptr : bit->second.second;
-}
-
-MTL::CommandBuffer* Device::new_command_buffer(int index) {
+  if (bit == buffer_map_.end()) {
   auto qit = queue_map_.find(index);
   if (qit == queue_map_.end()) {
     throw std::runtime_error(
@@ -187,7 +276,9 @@ MTL::CommandBuffer* Device::new_command_buffer(int index) {
   // Increment ref count so the buffer is not garbage collected
   cb->retain();
-  return buffer_map_.insert({index, {0, cb}}).first->second.second;
+  bit = buffer_map_.insert({index, {0, cb}}).first;
+  }
+  return bit->second.second;
 }

 void Device::commit_command_buffer(int index) {
@@ -198,25 +289,17 @@ void Device::commit_command_buffer(int index) {
 }

 void Device::end_encoding(int index) {
-  auto eit = encoder_map_.find(index);
-  if (eit != encoder_map_.end()) {
-    eit->second->endEncoding();
-    eit->second->release();
-    encoder_map_.erase(eit);
-  }
+  encoder_map_.erase(index);
 }

 CommandEncoder& Device::get_command_encoder(int index) {
   auto eit = encoder_map_.find(index);
   if (eit == encoder_map_.end()) {
     auto cb = get_command_buffer(index);
-    auto compute_encoder =
-        cb->computeCommandEncoder(MTL::DispatchTypeConcurrent);
-    // Increment ref count so the buffer is not garbage collected
-    compute_encoder->retain();
-    eit = encoder_map_.emplace(index, CommandEncoder{compute_encoder}).first;
+    eit =
+        encoder_map_.emplace(index, std::make_unique<CommandEncoder>(cb)).first;
   }
-  return eit->second;
+  return *(eit->second);
 }

 void Device::register_library(
@@ -228,13 +311,9 @@ void Device::register_library(
   }
 }

-void Device::register_library(
-    const std::string& lib_name,
-    const std::function<std::string(const std::string&)>& lib_path_func) {
+void Device::register_library(const std::string& lib_name) {
   if (auto it = library_map_.find(lib_name); it == library_map_.end()) {
-    std::string new_lib_path = lib_path_func(lib_name);
-    auto new_lib = load_library(device_, lib_name, new_lib_path.c_str());
-    library_map_.insert({lib_name, new_lib});
+    register_library(lib_name, get_colocated_mtllib_path(lib_name));
   }
 }
@@ -244,7 +323,7 @@ MTL::Library* Device::get_library_cache_(const std::string& lib_name) {
   if (auto it = library_map_.find(lib_name); it != library_map_.end()) {
     mtl_lib = it->second;
   } else { // Look for metallib alongside library
-    register_library(lib_name);
+    register_library(lib_name, get_colocated_mtllib_path(lib_name));
     mtl_lib = library_map_[lib_name];
   }
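
new_command_buffer is gone; get_command_buffer now follows a get-or-create pattern, creating and caching a buffer on a map miss instead of returning nullptr. The pattern, sketched with a plain map (illustrative values only, not MLX code):

#include <iostream>
#include <unordered_map>

// Sketch of get-or-create: a miss creates and caches the entry before
// returning it, so callers never see a null result.
int get_buffer(std::unordered_map<int, int>& m, int index) {
  auto it = m.find(index);
  if (it == m.end()) {
    it = m.insert({index, /* freshly created buffer id */ index * 10}).first;
  }
  return it->second;
}

int main() {
  std::unordered_map<int, int> buffers;
  std::cout << get_buffer(buffers, 1) << "\n";  // creates, prints 10
  std::cout << get_buffer(buffers, 1) << "\n";  // cached, prints 10
}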
@@ -258,13 +337,16 @@ MTL::Library* Device::get_library_(const std::string& source_string) {
       NS::String::string(source_string.c_str(), NS::ASCIIStringEncoding);
   NS::Error* error = nullptr;
-  auto mtl_lib = device_->newLibrary(ns_code, nullptr, &error);
+  auto options = MTL::CompileOptions::alloc()->init();
+  options->setFastMathEnabled(false);
+  options->setLanguageVersion(get_metal_version());
+  auto mtl_lib = device_->newLibrary(ns_code, options, &error);
+  options->release();

   // Throw error if unable to compile library
   if (!mtl_lib) {
     std::ostringstream msg;
-    msg << "[metal::Device] Unable to load build metal library from source"
-        << "\n";
+    msg << "[metal::Device] Unable to build metal library from source" << "\n";
     if (error) {
       msg << error->localizedDescription()->utf8String() << "\n";
     }
@@ -283,8 +365,7 @@ MTL::Library* Device::get_library_(const MTL::StitchedLibraryDescriptor* desc) {
   // Throw error if unable to compile library
   if (!mtl_lib) {
     std::ostringstream msg;
-    msg << "[metal::Device] Unable to load build stitched metal library"
-        << "\n";
+    msg << "[metal::Device] Unable to build stitched metal library" << "\n";
     if (error) {
       msg << error->localizedDescription()->utf8String() << "\n";
     }
@@ -342,7 +423,6 @@ MTL::Function* Device::get_function_(
   }

   mtl_func_consts->release();
-  desc->release();

   return mtl_function;
 }
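
Source compilation now passes explicit MTL::CompileOptions, pinning the language version and disabling fast math, presumably so kernels keep strict NaN/Inf semantics. A host-side illustration of the hazard fast math introduces (under -ffast-math a compiler may assume x == x always holds and fold the NaN check away):

#include <cmath>
#include <iostream>

// Compiled without fast math, this prints "nan detected"; with fast-math
// style assumptions the x != x test can be optimized to false.
int main() {
  volatile float x = std::nanf("");
  if (x != x) {
    std::cout << "nan detected\n";
  } else {
    std::cout << "nan check folded away\n";
  }
}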
@@ -511,11 +591,13 @@ MTL::ComputePipelineState* Device::get_kernel(
   // Compile kernel to compute pipeline
   auto mtl_linked_funcs = get_linked_functions_(linked_functions);
   auto kernel = get_kernel_(kname, mtl_function, mtl_linked_funcs);

   mtl_function->release();
   mtl_linked_funcs->release();

   // Add kernel to cache
   kernel_map_.insert({kname, kernel});
   return kernel;
 }
@@ -542,11 +624,12 @@ Device& device(mlx::core::Device) {
   return metal_device;
 }

-std::shared_ptr<void> new_scoped_memory_pool() {
+std::unique_ptr<void, std::function<void(void*)>> new_scoped_memory_pool() {
   auto dtor = [](void* ptr) {
     static_cast<NS::AutoreleasePool*>(ptr)->release();
   };
-  return std::shared_ptr<void>(NS::AutoreleasePool::alloc()->init(), dtor);
+  return std::unique_ptr<void, std::function<void(void*)>>(
+      NS::AutoreleasePool::alloc()->init(), dtor);
 }

 void new_stream(Stream stream) {
@@ -555,4 +638,23 @@ void new_stream(Stream stream) {
   }
 }

+std::unordered_map<std::string, std::variant<std::string, size_t>>
+device_info() {
+  auto raw_device = device(default_device()).mtl_device();
+  auto arch = std::string(raw_device->architecture()->name()->utf8String());
+
+  int mib[] = {CTL_HW, HW_MEMSIZE};
+  size_t memsize = 0;
+  size_t length = sizeof(memsize);
+
+  sysctl(mib, 2, &memsize, &length, NULL, 0);
+
+  return {
+      {"architecture", arch},
+      {"max_buffer_length", raw_device->maxBufferLength()},
+      {"max_recommended_working_set_size",
+       raw_device->recommendedMaxWorkingSetSize()},
+      {"memory_size", memsize}};
+}
+
 } // namespace mlx::core::metal
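
device_info() is new in this namespace: it reports the GPU architecture string and memory limits, using sysctl with CTL_HW/HW_MEMSIZE for total physical memory. A standalone, macOS-specific sketch of that query:

#include <sys/sysctl.h>
#include <cstdio>

// macOS-only: query total physical memory the same way device_info() does.
int main() {
  int mib[] = {CTL_HW, HW_MEMSIZE};
  size_t memsize = 0;
  size_t length = sizeof(memsize);
  if (sysctl(mib, 2, &memsize, &length, NULL, 0) == 0) {
    printf("memory_size: %zu bytes\n", memsize);
  }
}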


@@ -9,36 +9,17 @@
 #include <unordered_map>
 #include <unordered_set>

-#include <dlfcn.h>
-#include <filesystem>
-
 #include "mlx/array.h"
 #include "mlx/device.h"

-namespace fs = std::filesystem;
-
 namespace mlx::core::metal {

-inline std::string get_colocated_mtllib_path(const std::string& lib_name) {
-  Dl_info info;
-  std::string mtllib_path;
-  std::string lib_ext = lib_name + ".metallib";
-
-  int success = dladdr((void*)get_colocated_mtllib_path, &info);
-  if (success) {
-    auto mtllib = fs::path(info.dli_fname).remove_filename() / lib_ext;
-    mtllib_path = mtllib.c_str();
-  }
-
-  return mtllib_path;
-}
-
 using MTLFCList =
     std::vector<std::tuple<const void*, MTL::DataType, NS::UInteger>>;

 struct CommandEncoder {
-  CommandEncoder(MTL::ComputeCommandEncoder* enc)
-      : enc(enc), concurrent(false){};
+  CommandEncoder(MTL::CommandBuffer* cbuf);
+  CommandEncoder(const CommandEncoder&) = delete;
   CommandEncoder& operator=(const CommandEncoder&) = delete;

   struct ConcurrentContext {
@@ -60,41 +41,24 @@ struct CommandEncoder {
     return enc;
   }

-  void set_input_array(const array& a, int idx, int offset = 0) {
-    auto r_buf =
-        static_cast<MTL::Resource*>(const_cast<void*>(a.buffer().ptr()));
-    if (auto it = outputs.find(r_buf); it != outputs.end()) {
-      // Insert a barrier
-      enc->memoryBarrier(&r_buf, 1);
-      // Remove the output
-      outputs.erase(it);
-    }
-    auto a_buf = static_cast<const MTL::Buffer*>(a.buffer().ptr());
-    auto base_offset = a.data<char>() -
-        static_cast<char*>(const_cast<MTL::Buffer*>(a_buf)->contents());
-    base_offset += offset;
-    enc->setBuffer(a_buf, base_offset, idx);
-  }
-
-  void set_output_array(array& a, int idx, int offset = 0) {
-    // Add barriers before adding the output to the output set
-    set_input_array(a, idx, offset);
-    auto buf = static_cast<MTL::Resource*>(a.buffer().ptr());
-    if (concurrent) {
-      concurrent_outputs.insert(buf);
-    } else {
-      outputs.insert(buf);
-    }
-  }
+  void set_input_array(const array& a, int idx, int64_t offset = 0);
+  void set_output_array(array& a, int idx, int64_t offset = 0);
+  void dispatchThreadgroups(MTL::Size grid_dims, MTL::Size group_dims);
+  void dispatchThreads(MTL::Size grid_dims, MTL::Size group_dims);

   ConcurrentContext start_concurrent() {
     return ConcurrentContext(*this);
   }
+  ~CommandEncoder();

 private:
+  void maybe_split();
+
+  int num_dispatches{0};
+  MTL::CommandBuffer* cbuf;
   MTL::ComputeCommandEncoder* enc;
-  bool concurrent;
+  bool concurrent{false};
   std::unordered_set<MTL::Resource*> outputs;
   std::unordered_set<MTL::Resource*> concurrent_outputs;
 };
@@ -111,7 +75,6 @@ class Device {
   };

   void new_queue(int index);
-  MTL::CommandBuffer* new_command_buffer(int index);
   MTL::CommandBuffer* get_command_buffer(int index);
   int get_command_buffer_ops(int index);
   void increment_command_buffer_ops(int index);
@@ -122,10 +85,8 @@ class Device {
   void register_library(
       const std::string& lib_name,
       const std::string& lib_path);
-  void register_library(
-      const std::string& lib_name,
-      const std::function<std::string(const std::string&)>& lib_path_func =
-          get_colocated_mtllib_path);
+  void register_library(const std::string& lib_name);

   MTL::Library* get_library(const std::string& name);
@@ -197,7 +158,7 @@ class Device {
   MTL::Device* device_;
   std::unordered_map<int32_t, MTL::CommandQueue*> queue_map_;
   std::unordered_map<int32_t, std::pair<int, MTL::CommandBuffer*>> buffer_map_;
-  std::unordered_map<int32_t, CommandEncoder> encoder_map_;
+  std::unordered_map<int32_t, std::unique_ptr<CommandEncoder>> encoder_map_;
   std::unordered_map<std::string, MTL::ComputePipelineState*> kernel_map_;
   std::unordered_map<std::string, MTL::Library*> library_map_;
   std::mutex mtx_;
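
Since CommandEncoder is now non-copyable and its destructor ends the encoding, the device holds encoders behind std::unique_ptr: each one is constructed and destroyed exactly once, and end_encoding reduces to erasing the map entry. A small sketch of that RAII-in-a-map pattern (the Encoder type here is a stand-in, not MLX code):

#include <iostream>
#include <memory>
#include <unordered_map>

// An RAII type whose destructor has side effects (like
// CommandEncoder::~CommandEncoder ending the Metal encoding) is safest
// stored behind unique_ptr: one construction, one destruction, and a
// stable address while it lives in the map.
struct Encoder {
  explicit Encoder(int id) : id(id) { std::cout << "begin encoding " << id << "\n"; }
  ~Encoder() { std::cout << "end encoding " << id << "\n"; }
  Encoder(const Encoder&) = delete;
  Encoder& operator=(const Encoder&) = delete;
  int id;
};

int main() {
  std::unordered_map<int, std::unique_ptr<Encoder>> encoders;
  encoders.emplace(0, std::make_unique<Encoder>(0));
  encoders.erase(0);  // deterministic end-of-encoding, as in end_encoding()
}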


@@ -0,0 +1,30 @@
+// Copyright © 2024 Apple Inc.
+
+#include "mlx/event.h"
+#include "mlx/backend/metal/device.h"
+#include "mlx/backend/metal/metal_impl.h"
+
+namespace mlx::core {
+
+Event::Event(const Stream& stream) : stream_(stream) {
+  auto dtor = [](void* ptr) {
+    auto p = metal::new_scoped_memory_pool();
+    static_cast<MTL::SharedEvent*>(ptr)->release();
+  };
+  auto p = metal::new_scoped_memory_pool();
+  event_ = std::shared_ptr<void>(
+      metal::device(stream.device).mtl_device()->newSharedEvent(), dtor);
+}
+
+void Event::wait() {
+  if (!static_cast<MTL::SharedEvent*>(raw_event().get())
+           ->waitUntilSignaledValue(value(), -1)) {
+    throw std::runtime_error("[Event::wait] Timed out");
+  }
+}
+
+void Event::signal() {
+  static_cast<MTL::SharedEvent*>(raw_event().get())->setSignaledValue(value());
+}
+
+} // namespace mlx::core
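
This new file wraps an MTL::SharedEvent: Event::wait() blocks until the event's signaled value reaches value() (throwing on timeout), and Event::signal() sets it. A toy CPU analogue of those wait/signal semantics using a condition variable (not MLX code):

#include <condition_variable>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <thread>

// CPU stand-in for MTL::SharedEvent's setSignaledValue /
// waitUntilSignaledValue pair.
struct SharedEvent {
  std::mutex m;
  std::condition_variable cv;
  uint64_t signaled = 0;
  void set_signaled_value(uint64_t v) {
    { std::lock_guard<std::mutex> lk(m); signaled = v; }
    cv.notify_all();
  }
  void wait_until_signaled_value(uint64_t v) {
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, [&] { return signaled >= v; });
  }
};

int main() {
  SharedEvent ev;
  std::thread consumer([&] {
    ev.wait_until_signaled_value(1);
    std::cout << "event reached\n";
  });
  ev.set_signaled_value(1);  // producer signals
  consumer.join();
}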

Some files were not shown because too many files have changed in this diff.