Add locks to FileStream

Working IO primitives
Change Load to be an IOPrimitive
2025-09-12 23:34:36 +08:00 · 2024-05-08 23:19:27 -07:00 · 2024-05-08 22:17:25 -07:00 · 2024-05-08 18:59:20 -07:00 · 2024-05-08 18:02:22 -07:00 · 2024-05-07 16:58:14 -07:00
344 changed files with 13944 additions and 32112 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -31,7 +31,7 @@ jobs:
          name: Install dependencies
          command: |
            pip install --upgrade cmake
-            pip install nanobind==2.1.0
+            pip install git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4
            pip install numpy
            sudo apt-get update
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
@@ -44,12 +44,16 @@ jobs:
          name: Generate package stubs
          command: |
            echo "stubs"
-            pip install typing_extensions
            python setup.py generate_stubs 
      - run:
          name: Run Python tests
          command: |
            python3 -m unittest discover python/tests -v
+      # TODO: Reenable when extension api becomes stable
+      # - run:
+      #     name: Build example extension
+      #     command: |
+      #       cd examples/extensions && python3 -m pip install . 
      - run:
          name: Build CPP only
          command: |
@@ -65,19 +69,18 @@ jobs:
        default: "15.2.0"
    macos:
      xcode: << parameters.xcode_version >>
-    resource_class: macos.m1.medium.gen1
+    resource_class: macos.m1.large.gen1
    steps:
      - checkout
      - run:
          name: Install dependencies
          command: |
            brew install python@3.8
-            brew install openmpi
            python3.8 -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.1.0
+            pip install git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4
            pip install numpy
            pip install torch
            pip install tensorflow
@@ -91,7 +94,6 @@ jobs:
          name: Generate package stubs
          command: |
            source env/bin/activate
-            pip install typing_extensions
            python setup.py generate_stubs 
      - run:
          name: Run Python tests
@@ -99,14 +101,11 @@ jobs:
            source env/bin/activate
            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
-            mpirun -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
-      - run:
-          name: Build example extension
-          command: |
-            source env/bin/activate
-            cd examples/extensions
-            pip install -r requirements.txt
-            python setup.py build_ext -j8
+      # TODO: Reenable when extension api becomes stable
+      # - run:
+      #     name: Build example extension
+      #     command: |
+      #       cd examples/extensions && python3.11 -m pip install . 
      - store_test_results:
          path: test-results
      - run:
@@ -118,13 +117,7 @@ jobs:
          name: Run CPP tests
          command: |
            DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
-      - run:
-          name: Build small binary
-          command: |
-            source env/bin/activate
-            cd build/
-            cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel -DBUILD_SHARED_LIBS=ON -DMLX_BUILD_CPU=OFF -DMLX_BUILD_SAFETENSORS=OFF -DMLX_BUILD_GGUF=OFF -DMLX_METAL_JIT=ON
-            make -j
+            DEVICE=cpu ./build/tests/tests

  build_release:
    parameters:
@@ -139,19 +132,18 @@ jobs:
        default: ""
    macos:
      xcode: << parameters.xcode_version >>
-    resource_class: macos.m1.medium.gen1
+    resource_class: macos.m1.large.gen1
    steps:
      - checkout
      - run:
          name: Install dependencies
          command: |
            brew install python@<< parameters.python_version >>
-            brew install openmpi
            python<< parameters.python_version >> -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.1.0
+            pip install git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4
            pip install --upgrade setuptools
            pip install numpy
            pip install twine
@@ -167,7 +159,6 @@ jobs:
          name: Generate package stubs
          command: |
            source env/bin/activate
-            pip install typing_extensions
            python setup.py generate_stubs 
      - run:
          name: Build Python package
@@ -216,7 +207,7 @@ jobs:
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.1.0
+            pip install git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4
            pip install --upgrade setuptools
            pip install numpy
            pip install auditwheel
@@ -225,7 +216,6 @@ jobs:
            << parameters.extra_env >> \
              CMAKE_BUILD_PARALLEL_LEVEL="" \
              pip install . -v
-            pip install typing_extensions
            python setup.py generate_stubs 
            << parameters.extra_env >> \
              CMAKE_BUILD_PARALLEL_LEVEL="" \
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -17,4 +17,4 @@ jobs:
          pip install pre-commit black isort clang-format
      - name: Run lint
        run: |
-          pre-commit run --all-files
+          pre-commit run --all-files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 repos:
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v18.1.8
+    rev: v18.1.4
    hooks:
    -   id: clang-format
 # Using this mirror lets us use mypyc-compiled black, which is about 2x faster
 -   repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.8.0
+    rev: 24.4.2
    hooks:
    -   id: black
 -   repo: https://github.com/pycqa/isort
--- a/ACKNOWLEDGMENTS.md
+++ b/ACKNOWLEDGMENTS.md
@@ -10,14 +10,12 @@ MLX was developed with contributions from the following individuals:
 - Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`.
 - Juarez Bochi: Fixed bug in cross attention.
 - Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream`, safetensors support, `einsum`, and `einsum_path`.
+- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream` and safetensors support.
 - Gabrijel Boduljak: Added `mlx.core.linalg`, implemented `norm` method and `InstanceNorm` layer. Implemented pooling layers and ``Upsample``.
 - Hinrik Snær Guðmundsson: Added `atleast_1d`, `atleast_2d`, `atleast_3d` ops.
 - Luca Arnaboldi: Added `Ceil` and `Floor` ops; implemented pickling, copy and deepcopy for mlx arrays.
 - Brian Keene & Atila Orhon, with Argmax Inc.: Added `fast.scaled_dot_product_attention`
 - AmirHossein Razlighi: Added chaining support for some of the ops in `nn.Module`. Comparison works for non array objects in `mlx.core.array`. Exception handling for invalid operations in `mlx.core.array`.
- Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
- Paul Paczuski: Improved stability of BCE loss calculation

 <a href="https://github.com/ml-explore/mlx/graphs/contributors">
  <img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,16 +15,12 @@ option(MLX_BUILD_EXAMPLES "Build examples for mlx" ON)
 option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
 option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
 option(MLX_BUILD_METAL "Build metal backend" ON)
-option(MLX_BUILD_CPU "Build cpu backend" ON)
 option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
 option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
-option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
-option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
-option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
 option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)

 if(NOT MLX_VERSION)
-  set(MLX_VERSION 0.17.1)
+  set(MLX_VERSION 0.12.2)
 endif()

 # --------------------- Processor tests -------------------------
@@ -83,21 +79,22 @@ elseif (MLX_BUILD_METAL)
                  OUTPUT_VARIABLE MACOS_VERSION
                  COMMAND_ERROR_IS_FATAL ANY)

-  if (${MACOS_VERSION} LESS 14.0)
-    message(FATAL_ERROR "MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON" )
-  endif()
  message(STATUS "Building with SDK for macOS version ${MACOS_VERSION}")

-  set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18-beta.zip)
-  # Get the metal version
-  execute_process(
-    COMMAND zsh "-c" "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal -E -x metal -P - | tail -1 | tr -d '\n'"
-    OUTPUT_VARIABLE MLX_METAL_VERSION
-    COMMAND_ERROR_IS_FATAL ANY)
+  if ("${MACOS_VERSION}" VERSION_GREATER_EQUAL 14.2)  # component-wise compare: 14.10 and 14.2.1 both land here
+    set(METAL_CPP_PATCH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/metal.14.2.diff)
+    set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14.2_iOS17.2.zip)
+  elseif ("${MACOS_VERSION}" VERSION_GREATER_EQUAL 14.0)  # quoted so an empty value is a clean "false", not a syntax error
+    set(METAL_CPP_PATCH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/metal.14.0.diff)
+    set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14_iOS17-beta.zip)
+  else()
+    message(FATAL_ERROR "MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON" )
+  endif()

  FetchContent_Declare(
    metal_cpp
    URL ${METAL_CPP_URL}
+    PATCH_COMMAND /usr/bin/patch -N -i ${METAL_CPP_PATCH} || true
  )

  FetchContent_MakeAvailable(metal_cpp)
@@ -107,85 +104,55 @@ elseif (MLX_BUILD_METAL)
    $<INSTALL_INTERFACE:include/metal_cpp>
  )
  target_link_libraries(
-    mlx PUBLIC
+    mlx
    ${METAL_LIB}
    ${FOUNDATION_LIB}
    ${QUARTZ_LIB})
-
-  add_compile_definitions("MLX_METAL_VERSION=${MLX_METAL_VERSION}")
 endif()

-if (MLX_BUILD_CPU)
-  find_library(ACCELERATE_LIBRARY Accelerate)
-  if (MLX_BUILD_ARM AND ACCELERATE_LIBRARY)
-    message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
-    set(MLX_BUILD_ACCELERATE ON)
-    target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
-    add_compile_definitions(ACCELERATE_NEW_LAPACK)
-  else()
-    message(STATUS "Accelerate or arm neon not found, using default backend.")
-    set(MLX_BUILD_ACCELERATE OFF)
-    if(${CMAKE_HOST_APPLE})
-      # The blas shipped in macOS SDK is not supported, search homebrew for
-      # openblas instead.
-      set(BLA_VENDOR OpenBLAS)
-      set(LAPACK_ROOT "${LAPACK_ROOT};$ENV{LAPACK_ROOT};/usr/local/opt/openblas")
-    endif()
-    # Search and link with lapack.
-    find_package(LAPACK REQUIRED)
-    if (NOT LAPACK_FOUND)
-      message(FATAL_ERROR "Must have LAPACK installed")
-    endif()
-    find_path(LAPACK_INCLUDE_DIRS lapacke.h
-      /usr/include
-      /usr/local/include
-      /usr/local/opt/openblas/include)
-    message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
-    message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
-    target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
-    target_link_libraries(mlx PUBLIC ${LAPACK_LIBRARIES})
-    # List blas after lapack otherwise we may accidentally incldue an old version
-    # of lapack.h from the include dirs of blas.
-    find_package(BLAS REQUIRED)
-    if (NOT BLAS_FOUND)
-      message(FATAL_ERROR "Must have BLAS installed")
-    endif()
-    # TODO find a cleaner way to do this
-    find_path(BLAS_INCLUDE_DIRS cblas.h
-      /usr/include
-      /usr/local/include
-      $ENV{BLAS_HOME}/include)
-    message(STATUS "Blas lib " ${BLAS_LIBRARIES})
-    message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
-    target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
-    target_link_libraries(mlx PUBLIC ${BLAS_LIBRARIES})
-  endif()
+find_library(ACCELERATE_LIBRARY Accelerate)
+if (MLX_BUILD_ARM AND ACCELERATE_LIBRARY)
+  message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
+  set(MLX_BUILD_ACCELERATE ON)
+  target_link_libraries(mlx ${ACCELERATE_LIBRARY})
+  add_compile_definitions(ACCELERATE_NEW_LAPACK)
 else()
+  message(STATUS "Accelerate or arm neon not found, using default backend.")
  set(MLX_BUILD_ACCELERATE OFF)
-endif()
-
-find_package(MPI)
-if (MPI_FOUND)
-  execute_process(
-    COMMAND zsh "-c" "mpirun --version"
-    OUTPUT_VARIABLE MPI_VERSION
-    ERROR_QUIET
-  )
-  if (${MPI_VERSION} MATCHES ".*Open MPI.*")
-    target_include_directories(mlx PRIVATE ${MPI_INCLUDE_PATH})
-  elseif (MPI_VERSION STREQUAL "")
-    set(MPI_FOUND FALSE)
-    message(
-      WARNING
-      "MPI found but mpirun is not available. Building without MPI."
-    )
-  else()
-    set(MPI_FOUND FALSE)
-    message(
-      WARNING
-      "MPI which is not OpenMPI found. Building without MPI."
-    )
-  endif() 
+  if(${CMAKE_HOST_APPLE})
+    # The blas shipped in macOS SDK is not supported, search homebrew for
+    # openblas instead.
+    set(BLA_VENDOR OpenBLAS)
+    set(LAPACK_ROOT "${LAPACK_ROOT};$ENV{LAPACK_ROOT};/usr/local/opt/openblas")
+  endif()
+  # Search and link with lapack.
+  find_package(LAPACK REQUIRED)
+  if (NOT LAPACK_FOUND)
+    message(FATAL_ERROR "Must have LAPACK installed")
+  endif()
+  find_path(LAPACK_INCLUDE_DIRS lapacke.h
+    /usr/include
+    /usr/local/include
+    /usr/local/opt/openblas/include)
+  message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
+  message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
+  target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
+  target_link_libraries(mlx ${LAPACK_LIBRARIES})
+  # List blas after lapack otherwise we may accidentally include an old version
+  # of lapack.h from the include dirs of blas.
+  find_package(BLAS REQUIRED)
+  if (NOT BLAS_FOUND)
+    message(FATAL_ERROR "Must have BLAS installed")
+  endif()
+  # TODO find a cleaner way to do this
+  find_path(BLAS_INCLUDE_DIRS cblas.h
+    /usr/include
+    /usr/local/include
+    $ENV{BLAS_HOME}/include)
+  message(STATUS "Blas lib " ${BLAS_LIBRARIES})
+  message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
+  target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
+  target_link_libraries(mlx ${BLAS_LIBRARIES})
 endif()

 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)
@@ -197,14 +164,6 @@ target_include_directories(
  $<INSTALL_INTERFACE:include>
 )

-FetchContent_Declare(fmt
-  GIT_REPOSITORY https://github.com/fmtlib/fmt.git
-  GIT_TAG 10.2.1 
-  EXCLUDE_FROM_ALL
-)
-FetchContent_MakeAvailable(fmt)
-target_link_libraries(mlx PRIVATE fmt::fmt-header-only)
-
 if (MLX_BUILD_PYTHON_BINDINGS)
  message(STATUS "Building Python bindings.")
  find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
--- a/README.md
+++ b/README.md
@@ -88,13 +88,13 @@ for more information on building the C++ and Python APIs from source.

 ## Contributing 

-Check out the [contribution guidelines](https://github.com/ml-explore/mlx/tree/main/CONTRIBUTING.md) for more information
+Check out the [contribution guidelines](CONTRIBUTING.md) for more information
 on contributing to MLX. See the
 [docs](https://ml-explore.github.io/mlx/build/html/install.html) for more
 information on building from source, and running tests.

 We are grateful for all of [our
-contributors](https://github.com/ml-explore/mlx/tree/main/ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
+contributors](ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
 to MLX and wish to be acknowledged, please add your name to the list in your
 pull request.

--- a/benchmarks/python/comparative/bench_torch.py
+++ b/benchmarks/python/comparative/bench_torch.py
@@ -185,7 +185,7 @@ def prelu(x: torch.Tensor) -> torch.Tensor:
 def mish(x: torch.Tensor) -> torch.Tensor:
     y = x
     for _ in range(100):
-        y = torch.nn.functional.mish(y)
+        y = torch.nn.functional.mish(y)  # chain the result; returning here ran 1 of 100 iterations and skipped the sync
     sync_if_needed(x)


@@ -283,14 +283,6 @@ def topk(axis, x):
    sync_if_needed(x)


-@torch.no_grad()
-def step_function(x):
-    y = x
-    for i in range(100):
-        y = torch.where(y < 0, 0, 1)
-    sync_if_needed(x)
-
-
@torch.no_grad()
 def selu(x):
    y = x
@@ -454,11 +446,5 @@ if __name__ == "__main__":
    elif args.benchmark == "topk":
        print(bench(topk, axis, x))

-    elif args.benchmark == "step":
-        print(bench(step_function, x))
-
-    elif args.benchmark == "selu":
-        print(bench(selu, x))
-
    else:
-        raise ValueError(f"Unknown benchmark `{args.benchmark}`.")
+        raise ValueError("Unknown benchmark")
--- a/benchmarks/python/comparative/compare.py
+++ b/benchmarks/python/comparative/compare.py
@@ -16,9 +16,7 @@ def run_or_raise(*args, **kwargs):
        result = run(*args, capture_output=True, **kwargs)
        return float(result.stdout)
    except ValueError:
-        raise ValueError(
-            f"stdout: {result.stdout.decode()}\nstderr: {result.stderr.decode()}"
-        )
+        raise ValueError(f"stdout: {result.stdout}\nstderr: {result.stderr}")


 def compare(args):
--- a/benchmarks/python/compile_bench.py
+++ b/benchmarks/python/compile_bench.py
@@ -9,6 +9,7 @@ from time_utils import time_fn


 def bench_gelu():
+
    def gelu(x):
        return x * (1 + mx.erf(x / math.sqrt(2))) / 2

@@ -50,6 +51,7 @@ def bench_gelu():


 def bench_layernorm():
+
    weight = mx.random.uniform(shape=(4096,)).astype(mx.float16)
    bias = mx.random.uniform(shape=(4096,)).astype(mx.float16)
    mx.eval(weight, bias)
--- a/benchmarks/python/conv_bench.py
+++ b/benchmarks/python/conv_bench.py
@@ -28,11 +28,11 @@ def bench(f, a, b):
    return (e - s) * 1e-9


-def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0)):
    def mx_conv_2D(a, b):
        ys = []
        for i in range(N_iter_func):
-            y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            y = mx.conv2d(a, b, stride=strides, padding=padding)
            ys.append(y)
        mx.eval(ys)
        return ys
@@ -40,12 +40,12 @@ def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
    return mx_conv_2D


-def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0)):
    @torch.no_grad()
    def pt_conv_2D(a, b):
        ys = []
        for i in range(N_iter_func):
-            y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            y = torch.conv2d(a, b, stride=strides, padding=padding)
            ys.append(y)
        torch.mps.synchronize()
        return ys
@@ -53,12 +53,11 @@ def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
    return pt_conv_2D


-def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, np_dtype):
+
    scale = 1.0 / math.sqrt(kH * kH * C)
    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
-    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
-        np_dtype
-    )
+    b_np = np.random.uniform(-scale, scale, (O, kH, kW, C)).astype(np_dtype)

    a_mx = mx.array(a_np)
    b_mx = mx.array(b_np)
@@ -68,15 +67,15 @@ def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):

    torch.mps.synchronize()

-    f_mx = make_mx_conv_2D(strides, padding, groups)
-    f_pt = make_pt_conv_2D(strides, padding, groups)
+    f_mx = make_mx_conv_2D(strides, padding)
+    f_pt = make_pt_conv_2D(strides, padding)

    time_torch = bench(f_pt, a_pt, b_pt)
    time_mlx = bench(f_mx, a_mx, b_mx)

-    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding)
    out_pt = torch.conv2d(
-        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding
    )
    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
    out_pt = out_pt.numpy(force=True)
@@ -85,7 +84,7 @@ def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
-            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch
@@ -96,40 +95,35 @@ if __name__ == "__main__":

    dtypes = ("float32",)
    shapes = (
-        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
-        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
-        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
-        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
-        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
-        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
-        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
-        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
-        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
-        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
-        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
-        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
-        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
-        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
-        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
-        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
-        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
-        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
-        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2)),
+        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2)),
+        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2)),
+        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2)),
+        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2)),
+        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2)),
+        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2)),
+        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2)),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2)),
+        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2)),
+        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2)),
+        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2)),
+        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2)),
+        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2)),
+        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2)),
+        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2)),
    )

    for dtype in dtypes:
-        print(
-            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
-        )
-        for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
+        print("(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  diff%")
+        for N, H, W, C, kH, kW, O, strides, padding in shapes:
            np_dtype = getattr(np, dtype)
            time_mlx, time_torch = bench_shape(
-                N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
+                N, H, W, C, kH, kW, O, strides, padding, np_dtype
            )
            diff = time_torch / time_mlx - 1.0

            print(
-                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {100. * diff:+5.2f}%"
            )
            if time_mlx >= 2.0 * time_torch:
                print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/einsum_bench.py
+++ b/benchmarks/python/einsum_bench.py
@@ -1,84 +0,0 @@
-# Copyright © 2024 Apple Inc.
-
-import time
-
-import mlx.core as mx
-import numpy as np
-
-
-def timeit(fn, its=100, args=[]):
-    for _ in range(5):
-        fn(*args)
-    tic = time.perf_counter()
-    for _ in range(its):
-        fn(*args)
-    toc = time.perf_counter()
-    return 1e3 * (toc - tic) / its
-
-
-def time_little_einsum_path():
-    subscripts = "ik,kj->ij"
-    x = mx.ones((32, 32))
-    y = mx.ones((32, 32))
-    mx_time = timeit(mx.einsum_path, args=(subscripts, x, y))
-
-    x = np.array(x)
-    y = np.array(y)
-    np_time = timeit(np.einsum_path, args=(subscripts, x, y))
-    print("Timing little einsum path...")
-    print(f"MLX ... {mx_time:.3f} ms")
-    print(f"NumPy... {np_time:.3f} ms")
-
-
-def time_big_einsum_path():
-    chars = list("abcdefgh")
-    char_to_dim = {c: v for v, c in enumerate(chars)}
-
-    num_inputs = 10
-    inputs = []
-    subscripts = []
-    for _ in range(num_inputs):
-        subscript = np.random.choice(chars, size=5, replace=False).tolist()
-        subscripts.append("".join(subscript))
-        inputs.append(np.ones(list(char_to_dim[c] for c in subscript)))
-    subscripts = ",".join(subscripts)
-
-    np_time = timeit(np.einsum_path, args=(subscripts, *inputs))
-
-    inputs = [mx.array(x) for x in inputs]
-    mx_time = timeit(mx.einsum_path, args=(subscripts, *inputs))
-    print("Timing big einsum path...")
-    print(f"MLX ... {mx_time:.3f} ms")
-    print(f"NumPy... {np_time:.3f} ms")
-
-
-def time_attention():
-    def regular_attention(x):
-        # shape [batch, sequence, num_heads, head_dim]
-        queries, keys, values = x, x, x
-        scores = queries.transpose(0, 2, 1, 3) @ keys.transpose(0, 2, 3, 1)
-        scores = mx.softmax(scores, axis=-1)
-        output = (scores @ values.transpose(0, 2, 1, 3)).swapaxes(1, 2)
-        mx.eval(output)
-
-    def einsum_attention(x):
-        # shape [batch, sequence, num_heads, head_dim]
-        queries, keys, values = x, x, x
-        scores = mx.einsum("itjk,iujk->ijtu", queries, keys)
-        scores = mx.softmax(scores, axis=-1)
-        output = mx.einsum("ijtu,iujk->itjk", scores, values)
-        mx.eval(output)
-
-    x = mx.random.uniform(shape=(8, 512, 32, 128))
-
-    regular_time = timeit(regular_attention, args=(x,))
-    ein_time = timeit(einsum_attention, args=(x,))
-    print("Timing einsum attention...")
-    print(f"Regular ... {regular_time:.3f} ms")
-    print(f"Einsum ... {ein_time:.3f} ms")
-
-
-if __name__ == "__main__":
-    time_little_einsum_path()
-    time_big_einsum_path()
-    time_attention()
--- a/benchmarks/python/fft_bench.py
+++ b/benchmarks/python/fft_bench.py
@@ -3,8 +3,6 @@
 import matplotlib
 import mlx.core as mx
 import numpy as np
-import sympy
-import torch
 from time_utils import measure_runtime

 matplotlib.use("Agg")
@@ -18,100 +16,41 @@ def bandwidth_gb(runtime_ms, system_size):
    return system_size * bytes_per_fft / runtime_ms * ms_per_s / bytes_per_gb


-def run_bench(system_size, fft_sizes, backend="mlx", dim=1):
-    def fft_mlx(x):
-        if dim == 1:
-            out = mx.fft.fft(x)
-        elif dim == 2:
-            out = mx.fft.fft2(x)
+def run_bench(system_size):
+    def fft(x):
+        out = mx.fft.fft(x)
        mx.eval(out)
        return out

-    def fft_mps(x):
-        if dim == 1:
-            out = torch.fft.fft(x)
-        elif dim == 2:
-            out = torch.fft.fft2(x)
-        torch.mps.synchronize()
-        return out
-
    bandwidths = []
-    for n in fft_sizes:
-        batch_size = system_size // n**dim
-        shape = [batch_size] + [n for _ in range(dim)]
-        if backend == "mlx":
-            x_np = np.random.uniform(size=(system_size // n, n)).astype(np.complex64)
-            x = mx.array(x_np)
-            mx.eval(x)
-            fft = fft_mlx
-        elif backend == "mps":
-            x_np = np.random.uniform(size=(system_size // n, n)).astype(np.complex64)
-            x = torch.tensor(x_np, device="mps")
-            torch.mps.synchronize()
-            fft = fft_mps
-        else:
-            raise NotImplementedError()
+    for k in range(4, 12):
+        n = 2**k
+        x = mx.random.uniform(shape=(system_size // n, n)).astype(mx.float32)
+        x = x.astype(mx.complex64)
+        mx.eval(x)
        runtime_ms = measure_runtime(fft, x=x)
-        bandwidth = bandwidth_gb(runtime_ms, np.prod(shape))
-        print(n, bandwidth)
-        bandwidths.append(bandwidth)
+        bandwidths.append(bandwidth_gb(runtime_ms, system_size))

-    return np.array(bandwidths)
+    return bandwidths


 def time_fft():
-    x = np.array(range(2, 512))
-    system_size = int(2**26)

-    print("MLX GPU")
-    with mx.stream(mx.gpu):
-        gpu_bandwidths = run_bench(system_size=system_size, fft_sizes=x)
-
-    print("MPS GPU")
-    mps_bandwidths = run_bench(system_size=system_size, fft_sizes=x, backend="mps")
-
-    print("CPU")
-    system_size = int(2**20)
    with mx.stream(mx.cpu):
-        cpu_bandwidths = run_bench(system_size=system_size, fft_sizes=x)
+        cpu_bandwidths = run_bench(system_size=int(2**22))

-    x = np.array(x)
+    with mx.stream(mx.gpu):
+        gpu_bandwidths = run_bench(system_size=int(2**29))

-    all_indices = x - x[0]
-    radix_2to13 = (
-        np.array([i for i in x if all(p <= 13 for p in sympy.primefactors(i))]) - x[0]
-    )
-    bluesteins = (
-        np.array([i for i in x if any(p > 13 for p in sympy.primefactors(i))]) - x[0]
-    )
-
-    for indices, name in [
-        (all_indices, "All"),
-        (radix_2to13, "Radix 2-13"),
-        (bluesteins, "Bluestein's"),
-    ]:
-        # plot bandwidths
-        print(name)
-        plt.scatter(x[indices], gpu_bandwidths[indices], color="green", label="GPU")
-        plt.scatter(x[indices], mps_bandwidths[indices], color="blue", label="MPS")
-        plt.scatter(x[indices], cpu_bandwidths[indices], color="red", label="CPU")
-        plt.title(f"MLX FFT Benchmark -- {name}")
-        plt.xlabel("N")
-        plt.ylabel("Bandwidth (GB/s)")
-        plt.legend()
-        plt.savefig(f"{name}.png")
-        plt.clf()
-
-    av_gpu_bandwidth = np.mean(gpu_bandwidths)
-    av_mps_bandwidth = np.mean(mps_bandwidths)
-    av_cpu_bandwidth = np.mean(cpu_bandwidths)
-    print("Average bandwidths:")
-    print("GPU:", av_gpu_bandwidth)
-    print("MPS:", av_mps_bandwidth)
-    print("CPU:", av_cpu_bandwidth)
-
-    portion_faster = len(np.where(gpu_bandwidths > mps_bandwidths)[0]) / len(x)
-    print("Percent MLX faster than MPS: ", portion_faster * 100)
+    # plot bandwidths
+    x = [2**k for k in range(4, 12)]
+    plt.scatter(x, gpu_bandwidths, color="green", label="GPU")
+    plt.scatter(x, cpu_bandwidths, color="red", label="CPU")
+    plt.title("MLX FFT Benchmark")
+    plt.xlabel("N")
+    plt.ylabel("Bandwidth (GB/s)")
+    plt.legend()
+    plt.savefig("fft_plot.png")


 if __name__ == "__main__":
--- a/benchmarks/python/hadamard_bench.py
+++ b/benchmarks/python/hadamard_bench.py
@@ -1,70 +0,0 @@
-import argparse
-
-import matplotlib
-import mlx.core as mx
-import numpy as np
-from time_utils import measure_runtime
-
-matplotlib.use("Agg")
-import matplotlib.pyplot as plt
-
-
-def had(x):
-    y = mx.hadamard_transform(x)
-    mx.eval(y)
-
-
-def copy(x):
-    y = x + 1.0
-    mx.eval(y)
-
-
-def run(dtype):
-    system_size = 2**26
-    outputs = {}
-    for test_fn in (had, copy):
-        for m in [1, 12, 20, 28]:
-            if test_fn == copy:
-                key = "copy"
-            elif m == 1:
-                key = "had_2^k"
-            else:
-                key = "had_m*2^k"
-            outputs.setdefault(key, {})
-            for k in range(7, 14):
-                n = m * 2**k
-                if n > 2**15:
-                    continue
-                x_np = np.random.normal(size=(system_size // n, n)).astype(dtype)
-                x = mx.array(x_np)
-                runtime_ms = measure_runtime(test_fn, x=x)
-                bytes_per_gb = 1e9
-                ms_per_s = 1e3
-                bytes_per_had = np.dtype(x_np.dtype).itemsize * 2
-                bandwidth_gb = (
-                    system_size * bytes_per_had / runtime_ms * ms_per_s / bytes_per_gb
-                )
-                print(n, bandwidth_gb)
-                outputs[key][n] = bandwidth_gb
-
-    colors = {
-        "copy": "black",
-        "had_2^k": "steelblue",
-        "had_m*2^k": "skyblue",
-    }
-    for key, output in outputs.items():
-        plt.scatter(output.keys(), output.values(), color=colors[key], label=key)
-    plt.title(f"MLX Hadamard Benchmark -- {dtype.__name__}")
-    plt.xlabel("N")
-    plt.ylabel("Bandwidth (GB/s)")
-    plt.legend()
-    plt.savefig(f"bench_{dtype.__name__}.png")
-    plt.clf()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--fp16", action="store_true")
-    args = parser.parse_args()
-    dtype = np.float16 if args.fp16 else np.float32
-    run(dtype)
--- a/benchmarks/python/sdpa_bench.py
+++ b/benchmarks/python/sdpa_bench.py
@@ -1,62 +0,0 @@
-import argparse
-import math
-
-import mlx.core as mx
-from time_utils import time_fn
-
-MAX_SEQ = 300
-START_SEQ = 100
-SEQ_INCREMENT = 50
-
-
-def time_self_attention_primitives():
-    mx.random.seed(3)
-    B = 2
-    H = 38
-    D = 64
-    for R in range(START_SEQ, MAX_SEQ, SEQ_INCREMENT):
-        q = mx.random.uniform(shape=(B, H, R, D))
-        k = mx.random.uniform(shape=(B, H, R, D))
-        v = mx.random.uniform(shape=(B, H, R, D))
-        scale = 1.0 / math.sqrt(float(D))
-        mx.eval(q, k, v)
-
-        def sdpa_primitives(qs, ks, vs, alpha):
-            s = (alpha * qs) @ ks.transpose(0, 1, 3, 2)
-            p = mx.softmax(s.astype(mx.float32), axis=-1).astype(s.dtype)
-            o = p @ vs
-            return o
-
-        time_fn(sdpa_primitives, q, k, v, scale)
-
-
-def time_self_attention_sdpa():
-    mx.random.seed(3)
-    B = 2
-    H = 38
-    D = 64
-    for R in range(START_SEQ, MAX_SEQ, SEQ_INCREMENT):
-        q = mx.random.uniform(shape=(B, H, R, D))
-        k = mx.random.uniform(shape=(B, H, R, D))
-        v = mx.random.uniform(shape=(B, H, R, D))
-        scale = 1.0 / math.sqrt(float(D))
-        mx.eval(q, k, v)
-
-        def sdpa_fused(qs, ks, vs, alpha):
-            o = mx.fast.scaled_dot_product_attention(qs, ks, vs, scale=alpha)
-            return o
-
-        time_fn(sdpa_fused, q, k, v, scale)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser("MLX benchmarks.")
-    parser.add_argument("--gpu", action="store_true", help="Use the Metal back-end.")
-    args = parser.parse_args()
-    if args.gpu:
-        mx.set_default_device(mx.gpu)
-    else:
-        mx.set_default_device(mx.cpu)
-
-    time_self_attention_sdpa()
-    time_self_attention_primitives()
--- a/cmake/metal.14.0.diff
+++ b/cmake/metal.14.0.diff
@@ -0,0 +1,36 @@
+diff -ur Metal/MTLEvent.hpp MetalNew/MTLEvent.hpp
+--- Metal/MTLEvent.hpp	2023-06-01 12:18:26
+++ MetalNew/MTLEvent.hpp	2024-04-15 07:36:59
+@@ -62,6 +62,7 @@
+ 
+     uint64_t                 signaledValue() const;
+     void                     setSignaledValue(uint64_t signaledValue);
+    bool                     waitUntilSignaledValue(uint64_t signaledValue, uint64_t timeoutMS);
+ };
+ 
+ class SharedEventHandle : public NS::SecureCoding<SharedEventHandle>
+@@ -138,6 +139,11 @@
+ _MTL_INLINE void MTL::SharedEvent::setSignaledValue(uint64_t signaledValue)
+ {
+     Object::sendMessage<void>(this, _MTL_PRIVATE_SEL(setSignaledValue_), signaledValue);
+}
+
+// method: waitUntilSignaledValue
+_MTL_INLINE bool MTL::SharedEvent::waitUntilSignaledValue(uint64_t signaledValue, uint64_t timeoutMS) {
+    return Object::sendMessage<bool>(this, _MTL_PRIVATE_SEL(waitUntilSignaledValue_timeoutMS_), signaledValue, timeoutMS);
+ }
+ 
+ // static method: alloc
+diff -ur Metal/MTLHeaderBridge.hpp MetalNew/MTLHeaderBridge.hpp
+--- Metal/MTLHeaderBridge.hpp	2023-06-01 12:18:26
+++ MetalNew/MTLHeaderBridge.hpp	2024-04-15 07:37:29
+@@ -1906,6 +1906,9 @@
+     "setShouldMaximizeConcurrentCompilation:");
+ _MTL_PRIVATE_DEF_SEL(setSignaledValue_,
+     "setSignaledValue:");
+_MTL_PRIVATE_DEF_SEL(
+    waitUntilSignaledValue_timeoutMS_,
+    "waitUntilSignaledValue:timeoutMS:");
+ _MTL_PRIVATE_DEF_SEL(setSize_,
+     "setSize:");
+ _MTL_PRIVATE_DEF_SEL(setSlice_,
--- a/cmake/metal.14.2.diff
+++ b/cmake/metal.14.2.diff
@@ -0,0 +1,36 @@
+diff -ur Metal/MTLEvent.hpp MetalNew/MTLEvent.hpp
+--- Metal/MTLEvent.hpp	2024-04-15 07:12:10
+++ MetalNew/MTLEvent.hpp	2024-04-15 07:15:50
+@@ -62,6 +62,7 @@
+ 
+     uint64_t                 signaledValue() const;
+     void                     setSignaledValue(uint64_t signaledValue);
+    bool                     waitUntilSignaledValue(uint64_t signaledValue, uint64_t timeoutMS);
+ };
+ 
+ class SharedEventHandle : public NS::SecureCoding<SharedEventHandle>
+@@ -138,6 +139,11 @@
+ _MTL_INLINE void MTL::SharedEvent::setSignaledValue(uint64_t signaledValue)
+ {
+     Object::sendMessage<void>(this, _MTL_PRIVATE_SEL(setSignaledValue_), signaledValue);
+}
+
+// method: waitUntilSignaledValue
+_MTL_INLINE bool MTL::SharedEvent::waitUntilSignaledValue(uint64_t signaledValue, uint64_t timeoutMS) {
+    return Object::sendMessage<bool>(this, _MTL_PRIVATE_SEL(waitUntilSignaledValue_timeoutMS_), signaledValue, timeoutMS);
+ }
+ 
+ // static method: alloc
+diff -ur Metal/MTLHeaderBridge.hpp MetalNew/MTLHeaderBridge.hpp
+--- Metal/MTLHeaderBridge.hpp	2024-04-15 07:12:10
+++ MetalNew/MTLHeaderBridge.hpp	2024-04-15 07:16:15
+@@ -1918,6 +1918,9 @@
+     "setShouldMaximizeConcurrentCompilation:");
+ _MTL_PRIVATE_DEF_SEL(setSignaledValue_,
+     "setSignaledValue:");
+_MTL_PRIVATE_DEF_SEL(
+    waitUntilSignaledValue_timeoutMS_,
+    "waitUntilSignaledValue:timeoutMS:");
+ _MTL_PRIVATE_DEF_SEL(setSize_,
+     "setSize:");
+ _MTL_PRIVATE_DEF_SEL(setSlice_,
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,4 +1,3 @@
 sphinx
 breathe
 sphinx-book-theme
-mlx
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@@ -83,15 +83,3 @@ def setup(app):
 # -- Options for LaTeX output ------------------------------------------------

 latex_documents = [(main_doc, "MLX.tex", "MLX Documentation", author, "manual")]
-latex_elements = {
-    "preamble": r"""
-    \usepackage{enumitem}
-    \setlistdepth{5}
-    \setlist[itemize,1]{label=$\bullet$}
-    \setlist[itemize,2]{label=$\bullet$}
-    \setlist[itemize,3]{label=$\bullet$}
-    \setlist[itemize,4]{label=$\bullet$}
-    \setlist[itemize,5]{label=$\bullet$}
-    \renewlist{itemize}{itemize}{5}
-""",
-}
--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@@ -1,413 +0,0 @@
-Custom Metal Kernels
-====================
-
-MLX supports writing custom Metal kernels through the Python and C++ APIs.
-
-Simple Example
--------------
-
-Let's write a custom kernel that computes ``exp`` elementwise:
-
-.. code-block:: python
-
-  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          T tmp = inp[elem];
-          out[elem] = metal::exp(tmp);
-      """
-
-      kernel = mx.fast.metal_kernel(
-          name="myexp",
-          source=source,
-      )
-      outputs = kernel(
-          inputs={"inp": a},
-          template={"T": mx.float32},
-          grid=(a.size, 1, 1),
-          threadgroup=(256, 1, 1),
-          output_shapes={"out": a.shape},
-          output_dtypes={"out": a.dtype},
-      )
-      return outputs["out"]
-
-  a = mx.random.normal(shape=(4, 16)).astype(mx.float16)
-  b = exp_elementwise(a)
-  assert mx.allclose(b, mx.exp(a))
-
-.. note::
-    We are only required to pass the body of the Metal kernel in ``source``.
-
-The full function signature will be generated using:
-
-* The keys and shapes/dtypes of ``inputs``
-    In the above, ``a`` is an ``mx.array`` of type ``mx.float16`` and we pass it with the key ``inp``
-    so we will add ``const device float16_t* inp`` to the signature.
-    ``inp_shape``, ``inp_strides`` and ``inp_ndim`` are also added for convenience if they are present
-    in ``source``.
-* The keys and values of ``output_shapes`` and ``output_dtypes``
-    In the above, ``out`` is an ``mx.array`` of type ``mx.float16``
-    so we add ``device float16_t* out``.
-* Template parameters passed using ``template``
-    In the above, ``template={"T": mx.float32}`` adds a template of ``template <typename T>`` to the function
-    and instantiates the template with ``custom_kernel_myexp_float<float>``.
-    Template parameters can be ``mx.core.Dtype``, ``int`` or ``bool``.
-* Metal attributes used in ``source`` such as ``[[thread_position_in_grid]]``
-    These will be added as function arguments.
-    All the attributes defined in Table 5.8 of the `Metal Shading Language Specification <https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf>`_ are supported.
-
-Putting this all together, the generated function signature for ``myexp`` is as follows:
-
-.. code-block:: cpp
-
-  template <typename T>
-  [[kernel]] void custom_kernel_myexp_float(
-    const device float16_t* inp [[buffer(0)]],
-    device float16_t* out [[buffer(1)]],
-    uint3 thread_position_in_grid [[thread_position_in_grid]]) {
-
-          uint elem = thread_position_in_grid.x;
-          T tmp = inp[elem];
-          out[elem] = metal::exp(tmp);
-
-  }
-
-  template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;
-
-Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the generated code for debugging purposes.
-
-Using Shape/Strides
-------------------
-
-``mx.fast.metal_kernel`` supports an argument ``ensure_row_contiguous`` which is ``True`` by default.
-This will copy the ``mx.array`` inputs if needed before the kernel is launched to ensure that the memory layout is row contiguous.
-Generally this makes writing the kernel easier, since we don't have to worry about gaps or the ordering of the dims
-when indexing.
-
-If we want to avoid this copy, ``metal_kernel`` automatically passes ``a_shape``, ``a_strides`` and ``a_ndim`` for each
-input array ``a`` if any are present in ``source``.
-We can then use MLX's built in indexing utils to fetch the right elements for each thread.
-
-Let's convert ``myexp`` above to support arbitrarily strided arrays without relying on a copy from ``ensure_row_contiguous``:
-
-.. code-block:: python
-
-  def exp_elementwise(a: mx.array):
-      source = """
-          uint elem = thread_position_in_grid.x;
-          // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
-          uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
-          T tmp = inp[loc];
-          // Output arrays are always row contiguous
-          out[elem] = metal::exp(tmp);
-      """
-
-      kernel = mx.fast.metal_kernel(
-          name="myexp_strided",
-          source=source
-      )
-      outputs = kernel(
-          inputs={"inp": a},
-          template={"T": mx.float32},
-          grid=(a.size, 1, 1),
-          threadgroup=(256, 1, 1),
-          output_shapes={"out": a.shape},
-          output_dtypes={"out": a.dtype},
-          ensure_row_contiguous=False,
-      )
-      return outputs["out"]
-
-  a = mx.random.normal(shape=(4, 16)).astype(mx.float16)
-  # make non-contiguous
-  a = a[::2]
-  b = exp_elementwise(a)
-  assert mx.allclose(b, mx.exp(a))
-
-Complex Example
-----------------------------
-
-Let's implement a more complex example: ``grid_sample`` in ``"bilinear"`` mode.
-
-We'll start with the following MLX implementation using standard ops:
-
-.. code-block:: python
-
-    def grid_sample_ref(x, grid):
-        N, H_in, W_in, _ = x.shape
-        ix = ((grid[..., 0] + 1) * W_in - 1) / 2
-        iy = ((grid[..., 1] + 1) * H_in - 1) / 2
-
-        ix_nw = mx.floor(ix).astype(mx.int32)
-        iy_nw = mx.floor(iy).astype(mx.int32)
-
-        ix_ne = ix_nw + 1
-        iy_ne = iy_nw
-
-        ix_sw = ix_nw
-        iy_sw = iy_nw + 1
-
-        ix_se = ix_nw + 1
-        iy_se = iy_nw + 1
-
-        nw = (ix_se - ix)    * (iy_se - iy)
-        ne = (ix    - ix_sw) * (iy_sw - iy)
-        sw = (ix_ne - ix)    * (iy    - iy_ne)
-        se = (ix    - ix_nw) * (iy    - iy_nw)
-
-        I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
-        I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
-        I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
-        I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]
-
-        mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
-        mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
-        mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
-        mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)
-
-        I_nw *= mask_nw[..., None]
-        I_ne *= mask_ne[..., None]
-        I_sw *= mask_sw[..., None]
-        I_se *= mask_se[..., None]
-
-        output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se
-
-        return output
-
-Now let's use ``mx.custom_function`` together with ``mx.fast.metal_kernel``
-to write a fast GPU kernel for both the forward and backward passes.
-
-First we'll implement the forward pass as a fused kernel:
-
-.. code-block:: python
-
-    @mx.custom_function
-    def grid_sample(x, grid):
-
-        assert x.ndim == 4, "`x` must be 4D."
-        assert grid.ndim == 4, "`grid` must be 4D."
-
-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
-        out_shape = (B, gN, gM, C)
-
-        assert D == 2, "Last dim of `grid` must be size 2."
-
-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
-
-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
-
-            uint grid_idx = elem / C * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
-
-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
-
-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
-
-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
-
-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
-
-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
-
-            int batch_idx = elem / C / gH / gW * b_stride;
-            int channel_idx = elem % C;
-            int base_idx = batch_idx + channel_idx;
-
-            T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
-            T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
-            T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
-            T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];
-
-            I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
-            I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
-            I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
-            I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;
-
-            out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample",
-            source=source,
-        )
-        outputs = kernel(
-            inputs={"x": x, "grid": grid},
-            template={"T": x.dtype},
-            output_shapes={"out": out_shape},
-            output_dtypes={"out": x.dtype},
-            grid=(np.prod(out_shape), 1, 1),
-            threadgroup=(256, 1, 1),
-        )
-        return outputs["out"]
-
-For a reasonably sized input such as:
-
-.. code-block:: python
-
-    x.shape = (8, 1024, 1024, 64)
-    grid.shape = (8, 256, 256, 2)
-
-On an M1 Max, we see a big performance improvement:
-
-``55.7ms -> 6.7ms => 8x speed up``
-
-Grid Sample VJP
---------------
-
-Since we decorated ``grid_sample`` with ``mx.custom_function``, we can now define
-its custom vjp transform so MLX can differentiate it.
-
-The backwards pass requires atomically updating ``x_grad``/``grid_grad`` and so
-requires a few extra ``mx.fast.metal_kernel`` features:
-
-* ``init_value=0``
-    Initialize all of the kernel's outputs to this value before it runs. This allows us to update only part of the output arrays with the kernel.
-
-* ``atomic_outputs=True``
-    Designate all of the kernel outputs as ``atomic`` in the function signature. 
-    This means we can use Metal's ``atomic`` features to simultaneously update the ``x_grad`` and ``grid_grad`` arrays from multiple threadgroups. 
-    See section 6.15 of the `Metal Shading Language Specification <https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf>`_ for more details.
-
-We can then implement the backwards pass as follows:
-
-.. code-block:: python
-
-    @grid_sample.vjp
-    def grid_sample_vjp(primals, cotangent, _):
-        x, grid = primals
-        B, _, _, C = x.shape
-        _, gN, gM, D = grid.shape
-
-        assert D == 2, "Last dim of `grid` must be size 2."
-
-        source = """
-            uint elem = thread_position_in_grid.x;
-            int H = x_shape[1];
-            int W = x_shape[2];
-            int C = x_shape[3];
-            // Pad C to the nearest larger simdgroup size multiple
-            int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;
-
-            int gH = grid_shape[1];
-            int gW = grid_shape[2];
-
-            int w_stride = C;
-            int h_stride = W * w_stride;
-            int b_stride = H * h_stride;
-
-            uint grid_idx = elem / C_padded * 2;
-            float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
-            float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
-
-            int ix_nw = floor(ix);
-            int iy_nw = floor(iy);
-
-            int ix_ne = ix_nw + 1;
-            int iy_ne = iy_nw;
-
-            int ix_sw = ix_nw;
-            int iy_sw = iy_nw + 1;
-
-            int ix_se = ix_nw + 1;
-            int iy_se = iy_nw + 1;
-
-            T nw = (ix_se - ix)    * (iy_se - iy);
-            T ne = (ix    - ix_sw) * (iy_sw - iy);
-            T sw = (ix_ne - ix)    * (iy    - iy_ne);
-            T se = (ix    - ix_nw) * (iy    - iy_nw);
-
-            int batch_idx = elem / C_padded / gH / gW * b_stride;
-            int channel_idx = elem % C_padded;
-            int base_idx = batch_idx + channel_idx;
-
-            T gix = T(0);
-            T giy = T(0);
-            if (channel_idx < C) {
-                int cot_index = elem / C_padded * C + channel_idx;
-                T cot = cotangent[cot_index];
-                if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
-                    int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);
-
-                    T I_nw = x[offset];
-                    gix -= I_nw * (iy_se - iy) * cot;
-                    giy -= I_nw * (ix_se - ix) * cot;
-                }
-                if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
-                    int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);
-
-                    T I_ne = x[offset];
-                    gix += I_ne * (iy_sw - iy) * cot;
-                    giy -= I_ne * (ix - ix_sw) * cot;
-                }
-                if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
-                    int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);
-
-                    T I_sw = x[offset];
-                    gix -= I_sw * (iy - iy_ne) * cot;
-                    giy += I_sw * (ix_ne - ix) * cot;
-                }
-                if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
-                    int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
-                    atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);
-
-                    T I_se = x[offset];
-                    gix += I_se * (iy - iy_nw) * cot;
-                    giy += I_se * (ix - ix_nw) * cot;
-                }
-            }
-
-            T gix_mult = W / 2;
-            T giy_mult = H / 2;
-
-            // Reduce across each simdgroup first.
-            // This is much faster than relying purely on atomics.
-            gix = simd_sum(gix);
-            giy = simd_sum(giy);
-
-            if (thread_index_in_simdgroup == 0) {
-                atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
-                atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
-            }
-        """
-        kernel = mx.fast.metal_kernel(
-            name="grid_sample_grad",
-            source=source,
-            atomic_outputs=True,
-        )
-        # pad the output channels to simd group size
-        # so that our `simd_sum`s don't overlap.
-        simdgroup_size = 32
-        C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
-        grid_size = B * gN * gM * C_padded
-        outputs = kernel(
-            inputs={"x": x, "grid": grid, "cotangent": cotangent},
-            template={"T": x.dtype},
-            output_shapes={"x_grad": x.shape, "grid_grad": grid.shape},
-            output_dtypes={"x_grad": x.dtype, "grid_grad": x.dtype},
-            grid=(grid_size, 1, 1),
-            threadgroup=(256, 1, 1),
-            init_value=0,
-        )
-        return outputs["x_grad"], outputs["grid_grad"]
-
-There's an even larger speed up for the vjp:
-
-``676.4ms -> 16.7ms => 40x speed up``
--- a/docs/src/dev/extensions.rst
+++ b/docs/src/dev/extensions.rst
@@ -1,5 +1,5 @@
-Custom Extensions in MLX
-========================
+Developer Documentation
+=======================

 You can extend MLX with custom operations on the CPU or GPU. This guide
 explains how to do that with a simple example.
@@ -486,14 +486,15 @@ below.
        std::ostringstream kname;
        kname << "axpby_" << "general_" << type_to_name(out);

-        // Make sure the metal library is available
-        d.register_library("mlx_ext");
+        // Make sure the metal library is available and look for it
+        // in the same folder as this executable if needed
+        d.register_library("mlx_ext", metal::get_colocated_mtllib_path);

        // Make a kernel from this metal library
        auto kernel = d.get_kernel(kname.str(), "mlx_ext");

        // Prepare to encode kernel
-        auto& compute_encoder = d.get_command_encoder(s.index);
+        auto compute_encoder = d.get_command_encoder(s.index);
        compute_encoder->setComputePipelineState(kernel);

        // Kernel parameters are registered with buffer indices corresponding to
@@ -502,11 +503,11 @@ below.
        size_t nelem = out.size();

        // Encode input arrays to kernel
-        compute_encoder.set_input_array(x, 0);
-        compute_encoder.set_input_array(y, 1);
+        set_array_buffer(compute_encoder, x, 0);
+        set_array_buffer(compute_encoder, y, 1);

        // Encode output arrays to kernel
-        compute_encoder.set_output_array(out, 2);
+        set_array_buffer(compute_encoder, out, 2);

        // Encode alpha and beta
        compute_encoder->setBytes(&alpha_, sizeof(float), 3);
@@ -530,7 +531,7 @@ below.

        // Launch the grid with the given number of threads divided among
        // the given threadgroups
-        compute_encoder.dispatchThreads(grid_dims, group_dims);
+        compute_encoder->dispatchThreads(grid_dims, group_dims);
    }

 We can now call the :meth:`axpby` operation on both the CPU and the GPU!
@@ -824,7 +825,7 @@ Let's look at a simple script and its results:

    print(f"c shape: {c.shape}")
    print(f"c dtype: {c.dtype}")
-    print(f"c correct: {mx.all(c == 6.0).item()}")
+    print(f"c correctness: {mx.all(c == 6.0).item()}")

 Output:

--- a/docs/src/examples/llama-inference.rst
+++ b/docs/src/examples/llama-inference.rst
@@ -15,7 +15,7 @@ module to concisely define the model architecture.
 Attention layer
 ^^^^^^^^^^^^^^^^

-We will start with the Llama attention layer which notably uses the RoPE
+We will start with the llama attention layer which notably uses the RoPE
 positional encoding. [1]_ In addition, our attention layer will optionally use a
 key/value cache that will be concatenated with the provided keys and values to
 support efficient inference.
--- a/docs/src/examples/mlp.rst
+++ b/docs/src/examples/mlp.rst
@@ -64,7 +64,7 @@ set:
 Next, setup the problem parameters and load the data. To load the data, you need our
 `mnist data loader
 <https://github.com/ml-explore/mlx-examples/blob/main/mnist/mnist.py>`_, which
-we will import as ``mnist``.
+we will import as `mnist`.

 .. code-block:: python

--- a/docs/src/index.rst
+++ b/docs/src/index.rst
@@ -43,7 +43,6 @@ are the CPU and GPU.
   usage/function_transforms
   usage/compile
   usage/numpy
-   usage/distributed
   usage/using_streams

 .. toctree::
@@ -70,7 +69,6 @@ are the CPU and GPU.
   python/metal
   python/nn
   python/optimizers
-   python/distributed
   python/tree_utils

 .. toctree::
@@ -85,4 +83,3 @@ are the CPU and GPU.

   dev/extensions
   dev/metal_debugger
-   dev/custom_metal_kernels
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -70,36 +70,36 @@ To build and install the MLX python library from source, first, clone MLX from

   git clone git@github.com:ml-explore/mlx.git mlx && cd mlx

+Install `nanobind <https://nanobind.readthedocs.io/en/latest/>`_ with:
+
+.. code-block:: shell
+
+    pip install git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4
+
 Then simply build and install MLX using pip:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL="" pip install .
+   env CMAKE_BUILD_PARALLEL_LEVEL="" pip install .

-For developing, install the package with development dependencies, and use an
-editable install:
+For development, use an editable install:

 .. code-block:: shell

-  CMAKE_BUILD_PARALLEL_LEVEL="" pip install -e ".[dev]"
+  env CMAKE_BUILD_PARALLEL_LEVEL="" pip install -e .

-Once the development dependencies are installed, you can build faster with:
-
-.. code-block:: shell
-
- CMAKE_BUILD_PARALLEL_LEVEL="" python setup.py build_ext -j --inplace
-
-Run the tests with:
+To make sure the install is working, run the tests with:

 .. code-block:: shell

+  pip install ".[testing]"
  python -m unittest discover python/tests

-Optional: Install stubs to enable auto completions and type checking from your
-IDE:
+Optional: Install stubs to enable auto completions and type checking from your IDE:

 .. code-block:: shell

+  pip install ".[dev]"
  python setup.py generate_stubs

 C++ API
@@ -153,18 +153,11 @@ should point to the path to the built metal library.
     - OFF
   * - MLX_BUILD_METAL
     - ON
-   * - MLX_BUILD_CPU
-     - ON
   * - MLX_BUILD_PYTHON_BINDINGS
     - OFF
   * - MLX_METAL_DEBUG
     - OFF
-   * - MLX_BUILD_SAFETENSORS
-     - ON
-   * - MLX_BUILD_GGUF
-     - ON
-   * - MLX_METAL_JIT
-     - OFF
+

 .. note::

@@ -183,37 +176,10 @@ should point to the path to the built metal library.

      xcrun -sdk macosx --show-sdk-version

-Binary Size Minimization
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-To produce a smaller binary use the CMake flags ``CMAKE_BUILD_TYPE=MinSizeRel``
-and ``BUILD_SHARED_LIBS=ON``.
-
-The MLX CMake build has several additional options to make smaller binaries.
-For example, if you don't need the CPU backend or support for safetensors and
-GGUF, you can do:
-
-.. code-block:: shell
-
-  cmake .. \
-    -DCMAKE_BUILD_TYPE=MinSizeRel \
-    -DBUILD_SHARED_LIBS=ON \
-    -DMLX_BUILD_CPU=OFF \
-    -DMLX_BUILD_SAFETENSORS=OFF \
-    -DMLX_BUILD_GGUF=OFF \
-    -DMLX_METAL_JIT=ON
-
-THE ``MLX_METAL_JIT`` flag minimizes the size of the MLX Metal library which
-contains pre-built GPU kernels. This substantially reduces the size of the
-Metal library by run-time compiling kernels the first time they are used in MLX
-on a given machine. Note run-time compilation incurs a cold-start cost which can
-be anwywhere from a few hundred millisecond to a few seconds depending on the
-application. Once a kernel is compiled, it will be cached by the system. The
-Metal kernel cache persists accross reboots.
-
 Troubleshooting
 ^^^^^^^^^^^^^^^

+
 Metal not found
 ~~~~~~~~~~~~~~~

--- a/docs/src/python/array.rst
+++ b/docs/src/python/array.rst
@@ -24,7 +24,6 @@ Array
    array.any
    array.argmax
    array.argmin
-    array.conj
    array.cos
    array.cummax
    array.cummin
@@ -58,4 +57,3 @@ Array
    array.transpose
    array.T
    array.var
-    array.view
--- a/docs/src/python/distributed.rst
+++ b/docs/src/python/distributed.rst
@@ -1,19 +0,0 @@
-.. _distributed:
-
-.. currentmodule:: mlx.core.distributed
-
-Distributed Communication
-==========================
-
-MLX provides a distributed communication package using MPI. The MPI library is
-loaded at runtime; if MPI is available then distributed communication is also
-made available.
-
-.. autosummary::
-   :toctree: _autosummary
-
-    Group
-    is_available
-    init
-    all_sum
-    all_gather
--- a/docs/src/python/fast.rst
+++ b/docs/src/python/fast.rst
@@ -12,5 +12,3 @@ Fast
  layer_norm
  rope
  scaled_dot_product_attention
-  affine_quantize
-  metal_kernel
--- a/docs/src/python/linalg.rst
+++ b/docs/src/python/linalg.rst
@@ -8,10 +8,5 @@ Linear Algebra
 .. autosummary:: 
   :toctree: _autosummary 

-    inv
-    tri_inv
    norm
-    cholesky
-    cholesky_inv
    qr
-    svd
--- a/docs/src/python/nn/functions.rst
+++ b/docs/src/python/nn/functions.rst
@@ -17,8 +17,6 @@ simple functions.
   gelu_approx
   gelu_fast_approx
   glu
-   hard_shrink
-   hard_tanh
   hardswish
   leaky_relu
   log_sigmoid
@@ -31,7 +29,6 @@ simple functions.
   sigmoid
   silu
   softmax
-   softmin
   softplus
   softshrink
   step
--- a/docs/src/python/nn/layers.rst
+++ b/docs/src/python/nn/layers.rst
@@ -15,21 +15,15 @@ Layers
   BatchNorm
   Conv1d
   Conv2d
-   Conv3d
   Dropout
   Dropout2d
   Dropout3d
   Embedding
   GELU
-   GLU
   GroupNorm
   GRU
-   HardShrink
-   HardTanh
-   Hardswish
   InstanceNorm
   LayerNorm
-   LeakyReLU
   Linear
   LSTM
   MaxPool1d
@@ -41,19 +35,13 @@ Layers
   QuantizedLinear
   RMSNorm
   ReLU
-   ReLU6
   RNN
   RoPE
   SELU
   Sequential
   SiLU
   SinusoidalPositionalEncoding
-   Softmin
   Softshrink
-   Softsign
-   Softmax
-   Softplus
   Step
-   Tanh
   Transformer
   Upsample
--- a/docs/src/python/ops.rst
+++ b/docs/src/python/ops.rst
@@ -10,7 +10,6 @@ Operations

   abs
   add
-   addmm
   all
   allclose
   any
@@ -20,14 +19,12 @@ Operations
   arcsin
   arcsinh
   arctan
-   arctan2
   arctanh
   argmax
   argmin
   argpartition
   argsort
   array_equal
-   as_strided
   atleast_1d
   atleast_2d
   atleast_3d
@@ -35,16 +32,14 @@ Operations
   bitwise_or
   bitwise_xor
   block_masked_mm
+   block_sparse_mm
   broadcast_to
   ceil
   clip
   concatenate
-   conj
-   conjugate
   convolve
   conv1d
   conv2d
-   conv3d
   conv_general
   cos
   cosh
@@ -58,8 +53,6 @@ Operations
   diagonal
   divide
   divmod
-   einsum
-   einsum_path
   equal
   erf
   erfinv
@@ -71,11 +64,8 @@ Operations
   floor
   floor_divide
   full
-   gather_mm
-   gather_qmm
   greater
   greater_equal
-   hadamard_transform
   identity
   inner
   isclose
@@ -83,7 +73,6 @@ Operations
   isnan
   isneginf
   isposinf
-   issubdtype
   left_shift
   less
   less_equal
@@ -107,7 +96,6 @@ Operations
   minimum
   moveaxis
   multiply
-   nan_to_num
   negative
   not_equal
   ones
@@ -115,13 +103,11 @@ Operations
   outer
   partition
   pad
-   power
   prod
   quantize
   quantized_matmul
   radians
   reciprocal
-   remainder
   repeat
   reshape
   right_shift
@@ -155,13 +141,11 @@ Operations
   tensordot
   tile
   topk
-   trace
   transpose
   tri
   tril
   triu
   var
-   view
   where
   zeros
   zeros_like
--- a/docs/src/python/optimizers.rst
+++ b/docs/src/python/optimizers.rst
@@ -31,41 +31,6 @@ model's parameters and the **optimizer state**.
            # Compute the new parameters but also the optimizer state.
            mx.eval(model.parameters(), optimizer.state)

-Saving and Loading
------------------
-
-To serialize an optimizer, save its state. To load an optimizer, load and set
-the saved state. Here's a simple example:
-
-.. code-block:: python
-
-   import mlx.core as mx
-   from mlx.utils import tree_flatten, tree_unflatten
-   import mlx.optimizers as optim
-
-   optimizer = optim.Adam(learning_rate=1e-2)
-
-   # Perform some updates with the optimizer
-   model = {"w" : mx.zeros((5, 5))}
-   grads = {"w" : mx.ones((5, 5))}
-   optimizer.update(model, grads)
-
-   # Save the state
-   state = tree_flatten(optimizer.state)
-   mx.save_safetensors("optimizer.safetensors", dict(state))
-
-   # Later on, for example when loading from a checkpoint,
-   # recreate the optimizer and load the state
-   optimizer = optim.Adam(learning_rate=1e-2)
-
-   state = tree_unflatten(list(mx.load("optimizer.safetensors").items()))
-   optimizer.state = state
-
-Note, not every optimizer configuation parameter is saved in the state. For
-example, for Adam the learning rate is saved but the ``betas`` and ``eps``
-parameters are not. A good rule of thumb is if the parameter can be scheduled
-then it will be included in the optimizer state.
-
 .. toctree::

   optimizers/optimizer
--- a/docs/src/python/random.rst
+++ b/docs/src/python/random.rst
@@ -44,4 +44,3 @@ we use a splittable version of Threefry, which is a counter-based PRNG.
   split
   truncated_normal
   uniform
-   laplace
--- a/docs/src/python/transforms.rst
+++ b/docs/src/python/transforms.rst
@@ -10,7 +10,6 @@ Transforms

   eval
   compile
-   custom_function
   disable_compile
   enable_compile
   grad
--- a/docs/src/usage/distributed.rst
+++ b/docs/src/usage/distributed.rst
@@ -1,166 +0,0 @@
-.. _usage_distributed:
-
-Distributed Communication
-=========================
-
-.. currentmodule:: mlx.core.distributed
-
-MLX utilizes `MPI <https://en.wikipedia.org/wiki/Message_Passing_Interface>`_ to
-provide distributed communication operations that allow the computational cost
-of training or inference to be shared across many physical machines. You can
-see a list of the supported operations in the :ref:`API docs<distributed>`.
-
-.. note::
-   A lot of operations may not be supported or not as fast as they should be.
-   We are adding more and tuning the ones we have as we are figuring out the
-   best way to do distributed computing on Macs using MLX.
-
-Getting Started
---------------
-
-MLX already comes with the ability to "talk" to MPI if it is installed on the
-machine. The minimal distributed program in MLX is as simple as:
-
-.. code:: python
-
-    import mlx.core as mx
-
-    world = mx.distributed.init()
-    x = mx.distributed.all_sum(mx.ones(10))
-    print(world.rank(), x)
-
-The program above sums the array ``mx.ones(10)`` across all
-distributed processes. If simply run with ``python``, however, only one
-process is launched and no distributed communication takes place.
-
-To launch the program in distributed mode we need to use ``mpirun`` or
-``mpiexec`` depending on the MPI installation. The simplest possible way is the
-following:
-
-.. code:: shell
-
-    $ mpirun -np 2 python test.py
-    1 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)
-    0 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)
-
-The above launches two processes on the same (local) machine and we can see
-both standard output streams. The processes send the array of 1s to each other
-and compute the sum which is printed. Launching with ``mpirun -np 4 ...`` would
-print 4 etc.
-
-Installing MPI
---------------
-
-MPI can be installed with Homebrew, using the Anaconda package manager or
-compiled from source. Most of our testing is done using ``openmpi`` installed
-with the Anaconda package manager as follows:
-
-.. code:: shell
-
-    $ conda install openmpi
-
-Installing with Homebrew may require specifying the location of ``libmpi.dyld``
-so that MLX can find it and load it at runtime. This can simply be achieved by
-passing the ``DYLD_LIBRARY_PATH`` environment variable to ``mpirun``.
-
-.. code:: shell
-
-    $ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python test.py
-
-Setting up Remote Hosts
-----------------------
-
-MPI can automatically connect to remote hosts and set up the communication over
-the network if the remote hosts can be accessed via ssh. A good checklist to
-debug connectivity issues is the following:
-
-* ``ssh hostname`` works from all machines to all machines without asking for
-  password or host confirmation
-* ``mpirun`` is accessible on all machines. You can call ``mpirun`` using its
-  full path to force all machines to use a specific path.
-* Ensure that the ``hostname`` used by MPI is the one that you have configured
-  in the ``.ssh/config`` files on all machines.
-
-.. note::
-  For an example hostname ``foo.bar.com`` MPI can use only ``foo`` as
-  the hostname passed to ssh if the current hostname matches ``*.bar.com``.
-
-An easy way to pass the host names to MPI is using a host file. A host file
-looks like the following, where ``host1`` and ``host2`` should be the fully
-qualified domain names or IPs for these hosts.
-
-.. code::
-
-    host1 slots=1
-    host2 slots=1
-
-When using MLX, it is very likely that you want to use 1 slot per host, ie one
-process per host.  The hostfile also needs to contain the current
-host if you want to run on the local host. Passing the host file to
-``mpirun`` is simply done using the ``--hostfile`` command line argument.
-
-Training Example
----------------
-
-In this section we will adapt an MLX training loop to support data parallel
-distributed training. Namely, we will average the gradients across a set of
-hosts before applying them to the model.
-
-Our training loop looks like the following code snippet if we omit the model,
-dataset and optimizer initialization.
-
-.. code:: python
-
-    model = ...
-    optimizer = ...
-    dataset = ...
-
-    def step(model, x, y):
-        loss, grads = loss_grad_fn(model, x, y)
-        optimizer.update(model, grads)
-        return loss
-
-    for x, y in dataset:
-        loss = step(model, x, y)
-        mx.eval(loss, model.parameters())
-
-All we have to do to average the gradients across machines is perform an
-:func:`all_sum` and divide by the size of the :class:`Group`. Namely we
-have to :func:`mlx.utils.tree_map` the gradients with following function.
-
-.. code:: python
-
-    def all_avg(x):
-        return mx.distributed.all_sum(x) / mx.distributed.init().size()
-
-Putting everything together our training loop step looks as follows with
-everything else remaining the same.
-
-.. code:: python
-
-    from mlx.utils import tree_map
-
-    def all_reduce_grads(grads):
-        N = mx.distributed.init()
-        if N == 1:
-            return grads
-        return tree_map(
-                lambda x: mx.distributed.all_sum(x) / N,
-                grads)
-
-    def step(model, x, y):
-        loss, grads = loss_grad_fn(model, x, y)
-        grads = all_reduce_grads(grads)  # <--- This line was added
-        optimizer.update(model, grads)
-        return loss
-
-Tuning All Reduce
-----------------
-
-We are working on improving the performance of all reduce on MLX but for now
-the two main things one can do to extract the most out of distributed training with MLX are:
-
-1. Perform a few large reductions instead of many small ones to improve
-   bandwidth and latency
-2. Pass ``--mca btl_tcp_links 4`` to ``mpirun`` to configure it to use 4 tcp
-   connections between each host to improve bandwidth
--- a/docs/src/usage/numpy.rst
+++ b/docs/src/usage/numpy.rst
@@ -3,11 +3,7 @@
 Conversion to NumPy and Other Frameworks
 ========================================

-MLX array supports conversion between other frameworks with either:  
-
-* The `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_. 
-* `DLPack <https://dmlc.github.io/dlpack/latest/>`_.  
-
+MLX array implements the `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_.
 Let's convert an array to NumPy and back.

 .. code-block:: python
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -9,4 +9,3 @@ build_example(tutorial.cpp)
 build_example(linear_regression.cpp)
 build_example(logistic_regression.cpp)
 build_example(metal_capture.cpp)
-build_example(distributed.cpp)
--- a/examples/cpp/distributed.cpp
+++ b/examples/cpp/distributed.cpp
@@ -1,22 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include <iostream>
-
-#include "mlx/mlx.h"
-
-using namespace mlx::core;
-
-int main() {
-  if (!distributed::is_available()) {
-    std::cout << "No communication backend found" << std::endl;
-    return 1;
-  }
-
-  auto global_group = distributed::init();
-  std::cout << global_group.rank() << " / " << global_group.size() << std::endl;
-
-  array x = ones({10});
-  array out = distributed::all_sum(x, global_group);
-
-  std::cout << out << std::endl;
-}
--- a/examples/cpp/tutorial.cpp
+++ b/examples/cpp/tutorial.cpp
@@ -89,8 +89,8 @@ void automatic_differentiation() {
  // dfdx is 2 * x

  // Get the second derivative by composing grad with grad
-  auto d2fdx2 = grad(grad(fn))(x);
-  // d2fdx2 is 2
+  auto df2dx2 = grad(grad(fn))(x);
+  // df2dx2 is 2
 }

 int main() {
--- a/examples/extensions/README.md
+++ b/examples/extensions/README.md
@@ -1,5 +1,5 @@

-## Build
+## Build the extensions

 ```
 pip install -e .
@@ -16,9 +16,3 @@ And then run:
 ```
 python setup.py build_ext -j8 --inplace
 ```
-
-## Test
-
-```
-python test.py
-```
--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@@ -249,14 +249,15 @@ void Axpby::eval_gpu(
  kname << (contiguous_kernel ? "contiguous_" : "general_");
  kname << type_to_name(out);

-  // Make sure the metal library is available
-  d.register_library("mlx_ext");
+  // Make sure the metal library is available and look for it
+  // in the same folder as this executable if needed
+  d.register_library("mlx_ext", metal::get_colocated_mtllib_path);

  // Make a kernel from this metal library
  auto kernel = d.get_kernel(kname.str(), "mlx_ext");

  // Prepare to encode kernel
-  auto& compute_encoder = d.get_command_encoder(s.index);
+  auto compute_encoder = d.get_command_encoder(s.index);
  compute_encoder->setComputePipelineState(kernel);

  // Kernel parameters are registered with buffer indices corresponding to
@@ -265,11 +266,11 @@ void Axpby::eval_gpu(
  size_t nelem = out.size();

  // Encode input arrays to kernel
-  compute_encoder.set_input_array(x, 0);
-  compute_encoder.set_input_array(y, 1);
+  set_array_buffer(compute_encoder, x, 0);
+  set_array_buffer(compute_encoder, y, 1);

  // Encode output arrays to kernel
-  compute_encoder.set_output_array(out, 2);
+  set_array_buffer(compute_encoder, out, 2);

  // Encode alpha and beta
  compute_encoder->setBytes(&alpha_, sizeof(float), 3);
@@ -295,7 +296,7 @@ void Axpby::eval_gpu(

  // Launch the grid with the given number of threads divided among
  // the given threadgroups
-  compute_encoder.dispatchThreads(grid_dims, group_dims);
+  compute_encoder->dispatchThreads(grid_dims, group_dims);
 }

 #else // Metal is not available
--- a/examples/extensions/mlx_sample_extensions/init.py
+++ b/examples/extensions/mlx_sample_extensions/init.py
@@ -2,4 +2,4 @@

 import mlx.core as mx

-from ._ext import axpby
+from .mlx_sample_extensions import *
--- a/examples/extensions/pyproject.toml
+++ b/examples/extensions/pyproject.toml
@@ -2,7 +2,7 @@
 requires = [
  "setuptools>=42",
  "cmake>=3.24",
-  "mlx>=0.17.0",
-  "nanobind==2.1.0",
+  "mlx>=0.9.0",
+  "nanobind@git+https://github.com/wjakob/nanobind.git@2f04eac452a6d9142dedb957701bdb20125561e4",
 ]
 build-backend = "setuptools.build_meta"
--- a/examples/extensions/requirements.txt
+++ b/examples/extensions/requirements.txt
@@ -1,4 +1,4 @@
 setuptools>=42
 cmake>=3.24
-mlx>=0.17.0
-nanobind==2.1.0
+mlx>=0.9.0
+nanobind@git+https://github.com/wjakob/nanobind.git#egg=4148debcf91f5ccab0c3b8d67b5c3cabd61f407f
--- a/examples/extensions/setup.py
+++ b/examples/extensions/setup.py
@@ -13,6 +13,7 @@ if __name__ == "__main__":
        cmdclass={"build_ext": extension.CMakeBuild},
        packages=["mlx_sample_extensions"],
        package_data={"mlx_sample_extensions": ["*.so", "*.dylib", "*.metallib"]},
+        extras_require={"dev": []},
        zip_safe=False,
        python_requires=">=3.8",
    )
--- a/examples/extensions/test.py
+++ b/examples/extensions/test.py
@@ -1,10 +0,0 @@
-import mlx.core as mx
-from mlx_sample_extensions import axpby
-
-a = mx.ones((3, 4))
-b = mx.ones((3, 4))
-c = axpby(a, b, 4.0, 2.0, stream=mx.cpu)
-
-print(f"c shape: {c.shape}")
-print(f"c dtype: {c.dtype}")
-print(f"c correct: {mx.all(c == 6.0).item()}")
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -6,7 +6,6 @@ target_sources(
  ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
@@ -20,17 +19,12 @@ target_sources(
  ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h
 )

-if (MLX_BUILD_CPU)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
-else()
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_cpu)
-endif()
-
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/distributed)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/io)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
 if (MLX_BUILD_ACCELERATE)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
-elseif(MLX_BUILD_CPU)
+else()
  target_sources(
    mlx
    PRIVATE
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -17,10 +17,6 @@ bool in_tracing() {
  return detail::InTracing::in_tracing();
 }

-bool retain_graph() {
-  return detail::RetainGraph::retain_graph();
-}
-
 } // namespace

 array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
@@ -106,7 +102,7 @@ void array::eval() {
 }

 bool array::is_tracer() const {
-  return array_desc_->is_tracer && in_tracing() || retain_graph();
+  return array_desc_->is_tracer && in_tracing();
 }

 void array::set_data(allocator::Buffer buffer, deleter_t d) {
@@ -175,11 +171,10 @@ array::~array() {
    return;
  }

-  // Ignore arrays that might be detached during eval
-  if (status() == array::Status::scheduled) {
+  // Ignore arrays that will be detached
+  if (status() != array::Status::unscheduled) {
    return;
  }
-
  // Break circular reference for non-detached arrays with siblings
  if (auto n = siblings().size(); n > 0) {
    bool do_detach = true;
@@ -211,7 +206,7 @@ void array::ArrayDesc::init() {
    strides[i] = size;
    size *= shape[i];
  }
-  for (const auto& in : inputs) {
+  for (auto& in : inputs) {
    is_tracer |= in.is_tracer();
  }
 }
@@ -236,7 +231,7 @@ array::ArrayDesc::ArrayDesc(

 array::ArrayDesc::~ArrayDesc() {
  // When an array description is destroyed it will delete a bunch of arrays
-  // that may also destroy their corresponding descriptions and so on and so
+  // that may also destroy their corresponding descriptions and so on and so
  // forth.
  //
  // This calls recursively the destructor and can result in stack overflow, we
--- a/mlx/array.h
+++ b/mlx/array.h
@@ -73,32 +73,32 @@ class array {
      this->array_desc_ = other.array_desc_;
    }
    return *this;
-  }
+  };

  /** The size of the array's datatype in bytes. */
  size_t itemsize() const {
    return size_of(dtype());
-  }
+  };

  /** The number of elements in the array. */
  size_t size() const {
    return array_desc_->size;
-  }
+  };

  /** The number of bytes in the array. */
  size_t nbytes() const {
    return size() * itemsize();
-  }
+  };

  /** The number of dimensions of the array. */
  size_t ndim() const {
    return array_desc_->shape.size();
-  }
+  };

  /** The shape of the array as a vector of integers. */
  const std::vector<int>& shape() const {
    return array_desc_->shape;
-  }
+  };

  /**
   *  Get the size of the corresponding dimension.
@@ -107,12 +107,12 @@ class array {
   *  bounds checking. */
  int shape(int dim) const {
    return shape().at(dim < 0 ? dim + ndim() : dim);
-  }
+  };

  /** The strides of the array. */
  const std::vector<size_t>& strides() const {
    return array_desc_->strides;
-  }
+  };

  /**
   *  Get the stride of the corresponding dimension.
@@ -121,12 +121,12 @@ class array {
   *  bounds checking. */
  size_t strides(int dim) const {
    return strides().at(dim < 0 ? dim + ndim() : dim);
-  }
+  };

  /** Get the arrays data type. */
  Dtype dtype() const {
    return array_desc_->dtype;
-  }
+  };

  /** Evaluate the array. */
  void eval();
@@ -160,10 +160,10 @@ class array {

    friend bool operator==(const ArrayIterator& a, const ArrayIterator& b) {
      return a.arr.id() == b.arr.id() && a.idx == b.idx;
-    }
+    };
    friend bool operator!=(const ArrayIterator& a, const ArrayIterator& b) {
      return !(a == b);
-    }
+    };

   private:
    const array& arr;
@@ -209,7 +209,7 @@ class array {
    allocator::Buffer buffer;
    deleter_t d;
    Data(allocator::Buffer buffer, deleter_t d = allocator::free)
-        : buffer(buffer), d(d) {}
+        : buffer(buffer), d(d) {};
    // Not copyable
    Data(const Data& d) = delete;
    Data& operator=(const Data& d) = delete;
@@ -230,22 +230,22 @@ class array {
  /** The array's primitive. */
  Primitive& primitive() const {
    return *(array_desc_->primitive);
-  }
+  };

  /** A shared pointer to the array's primitive. */
  std::shared_ptr<Primitive>& primitive_ptr() const {
    return array_desc_->primitive;
-  }
+  };

  /** Check if the array has an attached primitive or is a leaf node. */
  bool has_primitive() const {
    return array_desc_->primitive != nullptr;
-  }
+  };

  /** The array's inputs. */
  const std::vector<array>& inputs() const {
    return array_desc_->inputs;
-  }
+  };

  std::vector<array>& inputs() {
    return array_desc_->inputs;
@@ -259,12 +259,12 @@ class array {
  /** The array's siblings. */
  const std::vector<array>& siblings() const {
    return array_desc_->siblings;
-  }
+  };

  /** The array's siblings. */
  std::vector<array>& siblings() {
    return array_desc_->siblings;
-  }
+  };

  void set_siblings(std::vector<array> siblings, uint16_t position) {
    array_desc_->siblings = std::move(siblings);
@@ -281,7 +281,7 @@ class array {
    outputs.push_back(*this);
    outputs.insert(outputs.end(), siblings().begin() + idx, siblings().end());
    return outputs;
-  }
+  };

  /** Detach the array from the graph. */
  void detach();
@@ -289,19 +289,19 @@ class array {
  /** Get the Flags bit-field. */
  const Flags& flags() const {
    return array_desc_->flags;
-  }
+  };

  /** The size (in elements) of the underlying buffer the array points to. */
  size_t data_size() const {
    return array_desc_->data_size;
-  }
+  };

  allocator::Buffer& buffer() {
    return array_desc_->data->buffer;
-  }
+  };
  const allocator::Buffer& buffer() const {
    return array_desc_->data->buffer;
-  }
+  };

  // Return a copy of the shared pointer
  // to the array::Data struct
@@ -312,20 +312,19 @@ class array {
  template <typename T>
  T* data() {
    return static_cast<T*>(array_desc_->data_ptr);
-  }
+  };

  template <typename T>
  const T* data() const {
    return static_cast<T*>(array_desc_->data_ptr);
-  }
+  };

  enum Status { unscheduled, scheduled, available };

  bool is_available() const {
    return status() == Status::available;
  }
-
-  Status status() const {
+  const Status status() const {
    return array_desc_->status;
  }

--- a/mlx/backend/accelerate/conv.cpp
+++ b/mlx/backend/accelerate/conv.cpp
@@ -1,9 +1,9 @@
-// Copyright © 2023-2024 Apple Inc.
+// Copyright © 2023 Apple Inc.

 #include <cassert>

-#include <Accelerate/Accelerate.h>
 #include <simd/vector.h>
+#include <vecLib/vDSP.h>

 #include "mlx/backend/common/copy.h"
 #include "mlx/primitives.h"
--- a/mlx/backend/accelerate/matmul.cpp
+++ b/mlx/backend/accelerate/matmul.cpp
@@ -2,7 +2,8 @@

 #include <cassert>

-#include <Accelerate/Accelerate.h>
+#include <vecLib/BNNS/bnns.h>
+#include <vecLib/cblas_new.h>

 #include "mlx/backend/accelerate/utils.h"
 #include "mlx/backend/common/copy.h"
--- a/mlx/backend/accelerate/primitives.cpp
+++ b/mlx/backend/accelerate/primitives.cpp
@@ -3,7 +3,8 @@
 #include <cassert>
 #include <cmath>

-#include <Accelerate/Accelerate.h>
+#include <vecLib/vDSP.h>
+#include <vecLib/vForce.h>

 #include "mlx/allocator.h"
 #include "mlx/backend/common/binary.h"
@@ -31,12 +32,12 @@ DEFAULT(ArgReduce)
 DEFAULT(ArgSort)
 DEFAULT(AsStrided)
 DEFAULT(BlockMaskedMM)
+DEFAULT(BlockSparseMM)
 DEFAULT(Broadcast)
 DEFAULT(Ceil)
 DEFAULT(Concatenate)
-DEFAULT(Conjugate)
 DEFAULT(Copy)
-DEFAULT_MULTI(CustomTransforms)
+DEFAULT_MULTI(CustomVJP)
 DEFAULT_MULTI(Depends)
 DEFAULT_MULTI(DivMod)
 DEFAULT(NumberOfElements)
@@ -46,14 +47,10 @@ DEFAULT(ErfInv)
 DEFAULT(FFT)
 DEFAULT(Floor)
 DEFAULT(Gather)
-DEFAULT(GatherMM)
-DEFAULT(GatherQMM)
 DEFAULT(Greater)
 DEFAULT(GreaterEqual)
-DEFAULT(Hadamard)
 DEFAULT(Less)
 DEFAULT(LessEqual)
-DEFAULT(Load)
 DEFAULT(LogicalNot)
 DEFAULT(LogicalAnd)
 DEFAULT(LogicalOr)
@@ -80,7 +77,6 @@ DEFAULT(StopGradient)
 DEFAULT_MULTI(SVD)
 DEFAULT(Transpose)
 DEFAULT(Inverse)
-DEFAULT(Cholesky)

 void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
@@ -102,7 +98,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& b = inputs[1];

  if (a.dtype() == float32) {
-    binary_op<float>(
+    binary(
        a,
        b,
        out,
@@ -117,7 +113,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vadd((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n);
        });
  } else if (a.dtype() == int32) {
-    binary_op<int>(
+    binary(
        a,
        b,
        out,
@@ -132,7 +128,7 @@ void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vaddi((const int*)a, 1, (const int*)b, 1, (int*)o, 1, n);
        });
  } else {
-    eval(inputs, out);
+    binary(a, b, out, [](auto x, auto y) { return x + y; });
  }
 }

@@ -196,26 +192,6 @@ void ArcTan::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 2);
-  auto& a = inputs[0];
-  auto& b = inputs[1];
-  if (out.dtype() == float32 && a.flags().row_contiguous &&
-      b.flags().row_contiguous) {
-    if (a.is_donatable()) {
-      out.copy_shared_buffer(a);
-    } else if (b.is_donatable()) {
-      out.copy_shared_buffer(b);
-    } else {
-      out.set_data(allocator::malloc_or_wait(out.nbytes()));
-    }
-    int size = a.data_size();
-    vvatan2f(out.data<float>(), a.data<float>(), b.data<float>(), &size);
-  } else {
-    eval(inputs, out);
-  }
-}
-
 void ArcTanh::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
@@ -287,7 +263,7 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& b = inputs[1];

  if (a.dtype() == int32) {
-    binary_op<int>(
+    binary(
        a,
        b,
        out,
@@ -300,7 +276,7 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vdivi((const int*)b, 1, (const int*)a, 1, (int*)o, 1, n);
        });
  } else if (a.dtype() == float32) {
-    binary_op<float>(
+    binary(
        a,
        b,
        out,
@@ -315,7 +291,7 @@ void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vdiv((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
        });
  } else {
-    eval(inputs, out);
+    binary(a, b, out, [](auto x, auto y) { return x / y; });
  }
 }

@@ -326,8 +302,12 @@ void Exp::eval_cpu(const std::vector<array>& inputs, array& out) {
    set_unary_output_data(in, out);
    auto size = in.data_size();
    vvexpf(out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
+  } else if (issubdtype(out.dtype(), inexact)) {
+    unary_fp(in, out, [](auto x) { return std::exp(x); });
  } else {
-    eval(inputs, out);
+    throw std::invalid_argument(
+        "[exp] Cannot exponentiate elements in array"
+        " with non floating point type.");
  }
 }

@@ -389,8 +369,12 @@ void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
    auto size = in.data_size();
    vvlog1pf(
        out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
+  } else if (issubdtype(out.dtype(), inexact)) {
+    unary_fp(in, out, [](auto x) { return std::log1p(x); });
  } else {
-    eval(inputs, out);
+    throw std::invalid_argument(
+        "[log1p] Cannot compute log of elements in array with"
+        " non floating point type.");
  }
 }

@@ -400,7 +384,7 @@ void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& b = inputs[1];

  if (a.dtype() == float32) {
-    binary_op<float>(
+    binary(
        a,
        b,
        out,
@@ -415,7 +399,7 @@ void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vmul((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n);
        });
  } else {
-    eval(inputs, out);
+    binary(a, b, out, [](auto x, auto y) { return x * y; });
  }
 }

@@ -426,7 +410,7 @@ void Negative::eval_cpu(const std::vector<array>& inputs, array& out) {
    set_unary_output_data(in, out);
    vDSP_vneg(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
  } else {
-    eval(inputs, out);
+    unary(in, out, [](auto x) { return -x; });
  }
 }

@@ -513,7 +497,7 @@ void Square::eval_cpu(const std::vector<array>& inputs, array& out) {
    auto size = in.data_size();
    vDSP_vsq(in.data<float>(), 1, out.data<float>(), 1, size);
  } else {
-    eval(inputs, out);
+    unary(in, out, [](auto x) { return x * x; });
  }
 }

@@ -539,7 +523,7 @@ void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& b = inputs[1];

  if (a.dtype() == float32) {
-    binary_op<float>(
+    binary(
        a,
        b,
        out,
@@ -557,7 +541,7 @@ void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
          vDSP_vsub((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
        });
  } else if (a.dtype() == int32) {
-    binary_op<int>(
+    binary(
        a,
        b,
        out,
@@ -569,7 +553,7 @@ void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
        },
        UseDefaultBinaryOp());
  } else {
-    eval(inputs, out);
+    binary(a, b, out, [](auto x, auto y) { return x - y; });
  }
 }

--- a/mlx/backend/accelerate/reduce.cpp
+++ b/mlx/backend/accelerate/reduce.cpp
@@ -2,8 +2,8 @@

 #include <cassert>

-#include <Accelerate/Accelerate.h>
 #include <simd/vector.h>
+#include <vecLib/vDSP.h>

 #include "mlx/backend/common/reduce.h"
 #include "mlx/primitives.h"
--- a/mlx/backend/accelerate/softmax.cpp
+++ b/mlx/backend/accelerate/softmax.cpp
@@ -3,10 +3,7 @@
 #include <cassert>
 #include <limits>

-#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include <arm_neon.h>
-#endif
-
 #include <simd/math.h>
 #include <simd/vector.h>

@@ -56,26 +53,25 @@ inline simd_float16 simd_fast_exp(simd_float16 x) {
  return (*(simd_float16*)&epart) * x;
 }

-#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /**
 * The ARM neon equivalent of the fast exp above.
 */
 inline float16x8_t neon_fast_exp(float16x8_t x) {
-  x = vmulq_f16(x, vdupq_n_f16(float16_t(1.442695f))); // multiply with log_2(e)
-  x = vmaxq_f16(x, vdupq_n_f16(float16_t(-14.f))); // clamp under with -14
-  x = vminq_f16(x, vdupq_n_f16(float16_t(14.f))); // clamp over with 14
+  x = vmulq_f16(x, vdupq_n_f16(1.442695)); // multiply with log_2(e)
+  x = vmaxq_f16(x, vdupq_n_f16(-14)); // clamp under with -14
+  x = vminq_f16(x, vdupq_n_f16(14)); // clamp over with 14

-  float16x8_t ipart = vrndmq_f16(vaddq_f16(x, vdupq_n_f16(float16_t(0.5f))));
+  float16x8_t ipart = vrndmq_f16(vaddq_f16(x, vdupq_n_f16(0.5)));
  float16x8_t fpart = vsubq_f16(x, ipart);

-  x = vdupq_n_f16(float16_t(1.535336188319500e-4f));
-  x = vfmaq_f16(vdupq_n_f16(float16_t(1.339887440266574e-3f)), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(float16_t(1.339887440266574e-3f)), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(float16_t(9.618437357674640e-3f)), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(float16_t(5.550332471162809e-2f)), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(float16_t(2.402264791363012e-1f)), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(float16_t(6.931472028550421e-1f)), x, fpart);
-  x = vfmaq_f16(vdupq_n_f16(float16_t(1.000000000000000f)), x, fpart);
+  x = vdupq_n_f16(1.535336188319500e-4f);
+  x = vfmaq_f16(vdupq_n_f16(1.339887440266574e-3f), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(1.339887440266574e-3f), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(9.618437357674640e-3f), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(5.550332471162809e-2f), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(2.402264791363012e-1f), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(6.931472028550421e-1f), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(1.000000000000000f), x, fpart);

  // generate 2**ipart in the floating point representation using integer
  // bitshifting
@@ -111,55 +107,6 @@ inline float16_t neon_reduce_add(float16x8_t x) {
  return vget_lane_f16(y, 0);
 }

-template <typename T, typename VT>
-struct NeonFp16SimdOps {
-  VT init(T a) {
-    return vdupq_n_f16(a);
-  }
-
-  VT load(const T* a) {
-    return vld1q_f16(a);
-  }
-
-  void store(T* dst, VT x) {
-    vst1q_f16(dst, x);
-  }
-
-  VT max(VT a, VT b) {
-    return vmaxq_f16(a, b);
-  }
-
-  VT exp(VT x) {
-    return neon_fast_exp(x);
-  }
-
-  VT add(VT a, VT b) {
-    return vaddq_f16(a, b);
-  }
-
-  VT sub(VT a, T b) {
-    return vsubq_f16(a, vdupq_n_f16(b));
-  }
-
-  VT mul(VT a, VT b) {
-    return vmulq_f16(a, b);
-  }
-
-  VT mul(VT a, T b) {
-    return vmulq_f16(a, vdupq_n_f16(b));
-  }
-
-  T reduce_max(VT x) {
-    return neon_reduce_max(x);
-  }
-
-  T reduce_add(VT x) {
-    return neon_reduce_add(x);
-  }
-};
-
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
 template <typename T, typename VT>
 struct AccelerateSimdOps {
  VT init(T a) {
@@ -176,7 +123,7 @@ struct AccelerateSimdOps {

  VT max(VT a, VT b) {
    return simd_max(a, b);
-  }
+  };

  VT exp(VT x) {
    return simd_fast_exp(x);
@@ -207,6 +154,53 @@ struct AccelerateSimdOps {
  }
 };

+template <typename T, typename VT>
+struct NeonFp16SimdOps {
+  VT init(T a) {
+    return vdupq_n_f16(a);
+  }
+
+  VT load(const T* a) {
+    return vld1q_f16(a);
+  }
+
+  void store(T* dst, VT x) {
+    vst1q_f16(dst, x);
+  }
+
+  VT max(VT a, VT b) {
+    return vmaxq_f16(a, b);
+  };
+
+  VT exp(VT x) {
+    return neon_fast_exp(x);
+  }
+
+  VT add(VT a, VT b) {
+    return vaddq_f16(a, b);
+  }
+
+  VT sub(VT a, T b) {
+    return vsubq_f16(a, vdupq_n_f16(b));
+  }
+
+  VT mul(VT a, VT b) {
+    return vmulq_f16(a, b);
+  }
+
+  VT mul(VT a, T b) {
+    return vmulq_f16(a, vdupq_n_f16(b));
+  }
+
+  T reduce_max(VT x) {
+    return neon_reduce_max(x);
+  }
+
+  T reduce_add(VT x) {
+    return neon_reduce_add(x);
+  }
+};
+
 template <typename T, typename AccT, typename VT, typename Ops, int N>
 void softmax(const array& in, array& out) {
  Ops ops;
@@ -368,16 +362,12 @@ void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
            AccelerateSimdOps<float, simd_float16>,
            16>(in, out);
      } else {
-#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        softmax<
            float16_t,
            float16_t,
            float16x8_t,
            NeonFp16SimdOps<float16_t, float16x8_t>,
            8>(in, out);
-#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        eval(inputs, out); // Redirect to common backend for consistency
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
      }
      break;
    case bfloat16:
--- a/mlx/backend/accelerate/utils.h
+++ b/mlx/backend/accelerate/utils.h
@@ -1,8 +1,8 @@
-// Copyright © 2023-2024 Apple Inc.
+// Copyright © 2023 Apple Inc.

 #pragma once

-#include <Accelerate/Accelerate.h>
+#include <vecLib/BNNS/bnns.h>
 #include "mlx/dtype.h"

 namespace mlx::core {
--- a/mlx/backend/common/CMakeLists.txt
+++ b/mlx/backend/common/CMakeLists.txt
@@ -37,20 +37,16 @@ target_sources(
  ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/erf.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/reduce_utils.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
@@ -59,7 +55,7 @@ target_sources(
  ${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_impl.cpp
  ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
 )

--- a/mlx/backend/common/binary.cpp
+++ b/mlx/backend/common/binary.cpp
@@ -196,20 +196,6 @@ void LogAddExp::eval(const std::vector<array>& inputs, array& out) {
  }
 }

-void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 2); // LogicalAnd requires two input arrays
-  auto& in1 = inputs[0];
-  auto& in2 = inputs[1];
-  binary(in1, in2, out, detail::LogicalAnd());
-}
-
-void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 2); // LogicalOr requires two input arrays
-  auto& in1 = inputs[0];
-  auto& in2 = inputs[1];
-  binary(in1, in2, out, detail::LogicalOr());
-}
-
 void Maximum::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
@@ -307,25 +293,4 @@ void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-void ArcTan2::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 2);
-  const auto& a = inputs[0];
-  const auto& b = inputs[1];
-  if (out.dtype() == float32) {
-    binary_op<float>(a, b, out, detail::ArcTan2());
-  } else if (out.dtype() == float16) {
-    binary_op<float16_t>(a, b, out, detail::ArcTan2());
-  } else if (out.dtype() == bfloat16) {
-    binary_op<bfloat16_t>(a, b, out, detail::ArcTan2());
-  } else if (issubdtype(out.dtype(), inexact)) {
-    std::ostringstream err;
-    err << "[arctan2] Does not support " << out.dtype();
-    throw std::invalid_argument(err.str());
-  } else {
-    throw std::invalid_argument(
-        "[arctan2] Cannot compute inverse tangent for arrays"
-        " with non floating point type.");
-  }
-}
-
 } // namespace mlx::core
--- a/mlx/backend/common/binary.h
+++ b/mlx/backend/common/binary.h
@@ -1,8 +1,6 @@
 // Copyright © 2023 Apple Inc.

 #pragma once
-#include <cassert>
-
 #include "mlx/allocator.h"
 #include "mlx/array.h"
 #include "mlx/backend/common/utils.h"
--- a/mlx/backend/common/cholesky.cpp
+++ b/mlx/backend/common/cholesky.cpp
@@ -1,101 +0,0 @@
-// Copyright © 2023-2024 Apple Inc.
-
-#include "mlx/allocator.h"
-#include "mlx/backend/common/copy.h"
-#include "mlx/linalg.h"
-#include "mlx/primitives.h"
-
-#ifdef ACCELERATE_NEW_LAPACK
-#include <Accelerate/Accelerate.h>
-#else
-#include <lapack.h>
-#endif
-
-namespace mlx::core {
-
-namespace {
-
-// Delegate to the Cholesky factorization taking into account differences in
-// LAPACK implementations (basically how to pass the 'uplo' string to fortran).
-int spotrf_wrapper(char uplo, float* matrix, int N) {
-  int info;
-
-#ifdef LAPACK_FORTRAN_STRLEN_END
-  spotrf_(
-      /* uplo = */ &uplo,
-      /* n = */ &N,
-      /* a = */ matrix,
-      /* lda = */ &N,
-      /* info = */ &info,
-      /* uplo_len = */ static_cast<size_t>(1));
-#else
-  spotrf_(
-      /* uplo = */ &uplo,
-      /* n = */ &N,
-      /* a = */ matrix,
-      /* lda = */ &N,
-      /* info = */ &info);
-#endif
-
-  return info;
-}
-
-} // namespace
-
-void cholesky_impl(const array& a, array& factor, bool upper) {
-  // Lapack uses the column-major convention. We take advantage of the fact that
-  // the matrix should be symmetric:
-  //   (A)ᵀ = A
-  // and that a column-major lower triangular matrix is a row-major upper
-  // triangular matrix, so uplo is the opposite of what we would expect from
-  // upper
-
-  char uplo = (upper) ? 'L' : 'U';
-
-  // The decomposition is computed in place, so just copy the input to the
-  // output.
-  copy(
-      a,
-      factor,
-      a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
-
-  const int N = a.shape(-1);
-  const size_t num_matrices = a.size() / (N * N);
-
-  float* matrix = factor.data<float>();
-
-  for (int i = 0; i < num_matrices; i++) {
-    // Compute Cholesky factorization.
-    int info = spotrf_wrapper(uplo, matrix, N);
-
-    // TODO: We do nothing when the matrix is not positive semi-definite
-    // because throwing an error would result in a crash. If we figure out how
-    // to catch errors from the implementation we should throw.
-    if (info < 0) {
-      std::stringstream msg;
-      msg << "[cholesky] Cholesky decomposition failed with error code "
-          << info;
-      throw std::runtime_error(msg.str());
-    }
-
-    // Zero out the upper/lower triangle while advancing the pointer to the
-    // next matrix at the same time.
-    for (int row = 0; row < N; row++) {
-      if (upper) {
-        std::fill(matrix, matrix + row, 0);
-      } else {
-        std::fill(matrix + row + 1, matrix + N, 0);
-      }
-      matrix += N;
-    }
-  }
-}
-
-void Cholesky::eval(const std::vector<array>& inputs, array& output) {
-  if (inputs[0].dtype() != float32) {
-    throw std::runtime_error("[Cholesky::eval] only supports float32.");
-  }
-  cholesky_impl(inputs[0], output, upper_);
-}
-
-} // namespace mlx::core
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -1,304 +0,0 @@
-// Copyright © 2024 Apple Inc.
-#include <cassert>
-
-#include "mlx/backend/common/utils.h"
-#include "mlx/primitives.h"
-
-namespace mlx::core {
-
-void AsStrided::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-
-  auto& in = inputs[0];
-
-  if (!in.flags().row_contiguous) {
-    // Just ensuring that inputs[0] came from the ops which would ensure the
-    // input is row contiguous.
-    throw std::runtime_error(
-        "AsStrided must be used with row contiguous arrays only.");
-  }
-
-  // Compute the flags given the shape and strides
-  bool row_contiguous = true, col_contiguous = true;
-  size_t r = 1, c = 1;
-  for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
-    row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
-    col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
-    r *= shape_[i];
-    c *= shape_[j];
-  }
-  auto flags = in.flags();
-  // TODO: Compute the contiguous flag in a better way cause now we are
-  //       unnecessarily strict.
-  flags.contiguous = row_contiguous || col_contiguous;
-  flags.row_contiguous = row_contiguous;
-  flags.col_contiguous = col_contiguous;
-
-  // There is no easy way to compute the actual data size so we use out.size().
-  // The contiguous flag will almost certainly not be set so no code should
-  // rely on data_size anyway.
-  size_t data_size = out.size();
-
-  return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
-}
-
-void Broadcast::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  const auto& in = inputs[0];
-  if (out.size() == 0) {
-    out.set_data(nullptr);
-    return;
-  }
-  std::vector<size_t> strides(out.ndim(), 0);
-  int diff = out.ndim() - in.ndim();
-  for (int i = in.ndim() - 1; i >= 0; --i) {
-    strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
-  }
-  auto flags = in.flags();
-  if (out.size() > in.size()) {
-    flags.row_contiguous = flags.col_contiguous = false;
-  }
-  out.copy_shared_buffer(in, strides, flags, in.data_size());
-}
-
-void Copy::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  out.copy_shared_buffer(inputs[0]);
-}
-
-void CustomTransforms::eval(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  assert(inputs.size() > outputs.size());
-  for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
-       i++, j++) {
-    outputs[i].copy_shared_buffer(inputs[j]);
-  }
-}
-
-void Depends::eval(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  assert(inputs.size() > outputs.size());
-  for (int i = 0; i < outputs.size(); i++) {
-    outputs[i].copy_shared_buffer(inputs[i]);
-  }
-}
-
-void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-
-  double numel = 1;
-  for (auto ax : axes_) {
-    numel *= inputs[0].shape(ax);
-  }
-
-  if (inverted_) {
-    numel = 1.0 / numel;
-  }
-
-  switch (out.dtype()) {
-    case bool_:
-      *out.data<bool>() = static_cast<bool>(numel);
-      break;
-    case uint8:
-      *out.data<uint8_t>() = static_cast<uint8_t>(numel);
-      break;
-    case uint16:
-      *out.data<uint16_t>() = static_cast<uint16_t>(numel);
-      break;
-    case uint32:
-      *out.data<uint32_t>() = static_cast<uint32_t>(numel);
-      break;
-    case uint64:
-      *out.data<uint64_t>() = static_cast<uint64_t>(numel);
-      break;
-    case int8:
-      *out.data<int8_t>() = static_cast<int8_t>(numel);
-      break;
-    case int16:
-      *out.data<int16_t>() = static_cast<int16_t>(numel);
-      break;
-    case int32:
-      *out.data<int32_t>() = static_cast<int32_t>(numel);
-      break;
-    case int64:
-      *out.data<int64_t>() = static_cast<int64_t>(numel);
-      break;
-    case float16:
-      *out.data<float16_t>() = static_cast<float16_t>(numel);
-      break;
-    case float32:
-      *out.data<float>() = static_cast<float>(numel);
-      break;
-    case bfloat16:
-      *out.data<bfloat16_t>() = static_cast<bfloat16_t>(numel);
-      break;
-    case complex64:
-      *out.data<complex64_t>() = static_cast<complex64_t>(numel);
-      break;
-  }
-}
-
-std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
-    const array& in,
-    const array& out) {
-  // Special case for empty arrays or row contiguous arrays
-  if (in.size() == 0 || in.flags().row_contiguous) {
-    return {false, out.strides()};
-  }
-
-  // Special case for scalars
-  if (in.ndim() == 0) {
-    std::vector<size_t> out_strides(out.ndim(), 0);
-    return {false, out_strides};
-  }
-
-  // Firstly let's collapse all the contiguous dimensions of the input
-  auto [shape, _strides] = collapse_contiguous_dims(in);
-  auto& strides = _strides[0];
-
-  // If shapes fit exactly in the contiguous dims then no copy is necessary so
-  // let's check.
-  std::vector<size_t> out_strides;
-  bool copy_necessary = false;
-  int j = 0;
-  for (int i = 0; i < out.ndim(); i++) {
-    int N = out.shape(i);
-    if (j < shape.size() && shape[j] % N == 0) {
-      shape[j] /= N;
-      out_strides.push_back(shape[j] * strides[j]);
-      j += (shape[j] == 1);
-    } else if (N == 1) {
-      // i > 0 because otherwise j < shape.size() && shape[j] % 1 == 0
-      out_strides.push_back(out_strides.back());
-    } else {
-      copy_necessary = true;
-      break;
-    }
-  }
-
-  return {copy_necessary, out_strides};
-}
-
-void Reshape::shared_buffer_reshape(
-    const array& in,
-    const std::vector<size_t>& out_strides,
-    array& out) {
-  auto flags = in.flags();
-  if (flags.row_contiguous) {
-    // For row contiguous reshapes:
-    // - Shallow copy the buffer
-    // - If reshaping into a vector (all singleton dimensions except one) it
-    //    becomes col contiguous again.
-    auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
-    flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
-  }
-  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
-}
-
-void Split::eval(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  assert(inputs.size() == 1);
-
-  auto& in = inputs[0];
-
-  auto compute_new_flags = [](const auto& shape,
-                              const auto& strides,
-                              size_t in_data_size,
-                              auto flags) {
-    size_t data_size = 1;
-    size_t f_stride = 1;
-    size_t b_stride = 1;
-    flags.row_contiguous = true;
-    flags.col_contiguous = true;
-    for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
-      flags.col_contiguous &= strides[i] == f_stride || shape[i] == 1;
-      flags.row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
-      f_stride *= shape[i];
-      b_stride *= shape[ri];
-      if (strides[i] > 0) {
-        data_size *= shape[i];
-      }
-    }
-
-    if (data_size == 1) {
-      // Broadcasted scalar array is contiguous.
-      flags.contiguous = true;
-    } else if (data_size == in_data_size) {
-      // Means we sliced a broadcasted dimension so leave the "no holes" flag
-      // alone.
-    } else {
-      // We sliced something. So either we are row or col contiguous or we
-      // punched a hole.
-      flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
-    }
-
-    return std::pair<decltype(flags), size_t>{flags, data_size};
-  };
-
-  std::vector<int> indices(1, 0);
-  indices.insert(indices.end(), indices_.begin(), indices_.end());
-  for (int i = 0; i < indices.size(); i++) {
-    size_t offset = indices[i] * in.strides()[axis_];
-    auto [new_flags, data_size] = compute_new_flags(
-        outputs[i].shape(), in.strides(), in.data_size(), in.flags());
-    outputs[i].copy_shared_buffer(
-        in, in.strides(), new_flags, data_size, offset);
-  }
-}
-
-std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
-    const array& in) {
-  int64_t data_offset = 0;
-  std::vector<int64_t> inp_strides(in.ndim(), 0);
-  for (int i = 0; i < in.ndim(); ++i) {
-    data_offset += start_indices_[i] * in.strides()[i];
-    inp_strides[i] = in.strides()[i] * strides_[i];
-  }
-
-  return std::make_tuple(data_offset, inp_strides);
-}
-
-void StopGradient::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  out.copy_shared_buffer(inputs[0]);
-}
-
-void Transpose::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  std::vector<size_t> out_strides(out.ndim());
-  auto& in = inputs[0];
-  for (int ax = 0; ax < axes_.size(); ++ax) {
-    out_strides[ax] = in.strides()[axes_[ax]];
-  }
-
-  // Conditions for {row/col}_contiguous
-  // - array must be contiguous (no gaps)
-  // - underlying buffer size should have the same size as the array
-  // - cumulative product of shapes is equal to the strides (we can ignore axes
-  //   with size == 1)
-  //   - in the forward direction (column contiguous)
-  //   - in the reverse direction (row contiguous)
-  // - vectors are both row and col contiguous (hence if both row/col are
-  //   true, they stay true)
-  auto flags = in.flags();
-  if (flags.contiguous && in.data_size() == in.size()) {
-    size_t f_stride = 1;
-    size_t b_stride = 1;
-    flags.col_contiguous = true;
-    flags.row_contiguous = true;
-    for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
-      flags.col_contiguous &= (out_strides[i] == f_stride || out.shape(i) == 1);
-      f_stride *= out.shape(i);
-      flags.row_contiguous &=
-          (out_strides[ri] == b_stride || out.shape(ri) == 1);
-      b_stride *= out.shape(ri);
-    }
-  }
-  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
-}
-
-} // namespace mlx::core
--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -205,8 +205,8 @@ void compiled_allocate_outputs(
      // - Donatable
      // - Correct size
      // - Not a constant
-      if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
-          in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
+      if (in.flags().row_contiguous && in.nbytes() == outputs[o].nbytes() &&
+          in.is_donatable() &&
          constant_ids_.find(inputs_[i].id()) == constant_ids_.end()) {
        if (move_buffers) {
          outputs[o].move_shared_buffer(
--- a/mlx/backend/common/compiled_cpu.cpp
+++ b/mlx/backend/common/compiled_cpu.cpp
@@ -2,7 +2,6 @@

 #include <dlfcn.h>
 #include <filesystem>
-#include <fstream>
 #include <list>

 #include "mlx/backend/common/compiled.h"
--- a/mlx/backend/common/conv.cpp
+++ b/mlx/backend/common/conv.cpp
@@ -111,17 +111,13 @@ void slow_conv_2D(
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const int iH = 1 + in_dilation[0] * (in.shape(1) - 1); // Input spatial dim
  const int iW = 1 + in_dilation[1] * (in.shape(2) - 1); // Input spatial dim
-  const int C = in.shape(3); // In channels
  const int oH = out.shape(1); // Output spatial dim
  const int oW = out.shape(2); // Output spatial dim
  const int O = wt.shape(0); // Out channels
+  const int C = wt.shape(3); // In channels
  const int wH = wt.shape(1); // Weight spatial dim
  const int wW = wt.shape(2); // Weight spatial dim

-  const int groups = C / wt.shape(3);
-  const int C_per_group = wt.shape(3);
-  const int O_per_group = O / groups;
-
  const size_t in_stride_N = in.strides()[0];
  const size_t in_stride_H = in.strides()[1];
  const size_t in_stride_W = in.strides()[2];
@@ -145,35 +141,33 @@ void slow_conv_2D(
        int ih_base = oh * wt_strides[0] - padding[0];
        int iw_base = ow * wt_strides[1] - padding[1];

-        for (int g = 0; g < groups; ++g) {
-          for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
-            float r = 0.;
+        for (int o = 0; o < O; ++o) {
+          float r = 0.;

-            for (int wh = 0; wh < wH; ++wh) {
-              for (int ww = 0; ww < wW; ++ww) {
-                int wh_flip = flip ? wH - wh - 1 : wh;
-                int ww_flip = flip ? wW - ww - 1 : ww;
-                int ih = ih_base + wh_flip * wt_dilation[0];
-                int iw = iw_base + ww_flip * wt_dilation[1];
+          for (int wh = 0; wh < wH; ++wh) {
+            for (int ww = 0; ww < wW; ++ww) {
+              int wh_flip = flip ? wH - wh - 1 : wh;
+              int ww_flip = flip ? wW - ww - 1 : ww;
+              int ih = ih_base + wh_flip * wt_dilation[0];
+              int iw = iw_base + ww_flip * wt_dilation[1];

-                const T* wt_ptr_pt =
-                    wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
-                const T* in_ptr_pt =
-                    in_ptr + ih * in_stride_H + iw * in_stride_W;
+              const T* wt_ptr_pt = wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
+              const T* in_ptr_pt = in_ptr + ih * in_stride_H + iw * in_stride_W;

-                for (int c = g * C_per_group; c < (g + 1) * C_per_group; ++c) {
-                  r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
-                      static_cast<float>(
-                           wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
-                } // c
-              } // ww
-            } // wh
+              for (int c = 0; c < C; ++c) {
+                r += static_cast<float>(in_ptr_pt[0]) *
+                    static_cast<float>(wt_ptr_pt[0]);
+                in_ptr_pt += in_stride_C;
+                wt_ptr_pt += wt_stride_C;
+              } // c

-            out_ptr[0] = static_cast<T>(r);
-            out_ptr += out_stride_O;
-            wt_ptr += wt_stride_O;
-          } // o
-        } // g
+            } // ww
+          } // wh
+
+          out_ptr[0] = static_cast<T>(r);
+          out_ptr += out_stride_O;
+          wt_ptr += wt_stride_O;
+        } // o
      };

  int jump_h = flip ? -wt_dilation[0] : wt_dilation[0];
@@ -225,43 +219,41 @@ void slow_conv_2D(
        int wh_base = base_h[oh % f_out_jump_h];
        int ww_base = base_w[ow % f_out_jump_w];

-        for (int g = 0; g < groups; ++g) {
-          for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
-            float r = 0.;
+        for (int o = 0; o < O; ++o) {
+          float r = 0.;

-            for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
-              for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
-                int wh_flip = flip ? wH - wh - 1 : wh;
-                int ww_flip = flip ? wW - ww - 1 : ww;
-                int ih = ih_base + wh_flip * wt_dilation[0];
-                int iw = iw_base + ww_flip * wt_dilation[1];
+          for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
+            for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
+              int wh_flip = flip ? wH - wh - 1 : wh;
+              int ww_flip = flip ? wW - ww - 1 : ww;
+              int ih = ih_base + wh_flip * wt_dilation[0];
+              int iw = iw_base + ww_flip * wt_dilation[1];

-                if (ih >= 0 && ih < iH && iw >= 0 && iw < iW) {
-                  const T* wt_ptr_pt =
-                      wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
+              if (ih >= 0 && ih < iH && iw >= 0 && iw < iW) {
+                const T* wt_ptr_pt =
+                    wt_ptr + wh * wt_stride_H + ww * wt_stride_W;

-                  int ih_dil = !is_idil_one ? (ih / in_dilation[0]) : ih;
-                  int iw_dil = !is_idil_one ? (iw / in_dilation[1]) : iw;
+                int ih_dil = !is_idil_one ? (ih / in_dilation[0]) : ih;
+                int iw_dil = !is_idil_one ? (iw / in_dilation[1]) : iw;

-                  const T* in_ptr_pt =
-                      in_ptr + ih_dil * in_stride_H + iw_dil * in_stride_W;
+                const T* in_ptr_pt =
+                    in_ptr + ih_dil * in_stride_H + iw_dil * in_stride_W;

-                  for (int c = g * C_per_group; c < (g + 1) * C_per_group;
-                       ++c) {
-                    r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
-                        static_cast<float>(
-                             wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
-                  } // c
+                for (int c = 0; c < C; ++c) {
+                  r += static_cast<float>(in_ptr_pt[0]) *
+                      static_cast<float>(wt_ptr_pt[0]);
+                  in_ptr_pt += in_stride_C;
+                  wt_ptr_pt += wt_stride_C;
+                } // c

-                } // ih, iw check
-              } // ww
-            } // wh
+              } // ih, iw check
+            } // ww
+          } // wh

-            out_ptr[0] = static_cast<T>(r);
-            out_ptr += out_stride_O;
-            wt_ptr += wt_stride_O;
-          } // o
-        } // g
+          out_ptr[0] = static_cast<T>(r);
+          out_ptr += out_stride_O;
+          wt_ptr += wt_stride_O;
+        } // o
      };

  int oH_border_0 = 0;
@@ -318,296 +310,6 @@ void slow_conv_2D(
  } // n
 }

-template <typename T>
-void slow_conv_3D(
-    const array& in,
-    const array& wt,
-    array out,
-    const std::vector<int>& padding,
-    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation,
-    const std::vector<int>& in_dilation,
-    bool flip) {
-  const T* st_wt_ptr = wt.data<T>();
-  const T* st_in_ptr = in.data<T>();
-  T* st_out_ptr = out.data<T>();
-
-  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
-  const int iD = 1 + in_dilation[0] * (in.shape(1) - 1); // Input spatial dim
-  const int iH = 1 + in_dilation[1] * (in.shape(2) - 1); // Input spatial dim
-  const int iW = 1 + in_dilation[2] * (in.shape(3) - 1); // Input spatial dim
-  const int oD = out.shape(1); // Output spatial dim
-  const int oH = out.shape(2); // Output spatial dim
-  const int oW = out.shape(3); // Output spatial dim
-  const int O = wt.shape(0); // Out channels
-  const int C = wt.shape(4); // In channels
-  const int wD = wt.shape(1); // Weight spatial dim
-  const int wH = wt.shape(2); // Weight spatial dim
-  const int wW = wt.shape(3); // Weight spatial dim
-
-  const size_t in_stride_N = in.strides()[0];
-  const size_t in_stride_D = in.strides()[1];
-  const size_t in_stride_H = in.strides()[2];
-  const size_t in_stride_W = in.strides()[3];
-  const size_t in_stride_C = in.strides()[4];
-
-  const size_t wt_stride_O = wt.strides()[0];
-  const size_t wt_stride_D = wt.strides()[1];
-  const size_t wt_stride_H = wt.strides()[2];
-  const size_t wt_stride_W = wt.strides()[3];
-  const size_t wt_stride_C = wt.strides()[4];
-
-  const size_t out_stride_N = out.strides()[0];
-  const size_t out_stride_D = out.strides()[1];
-  const size_t out_stride_H = out.strides()[2];
-  const size_t out_stride_W = out.strides()[3];
-  const size_t out_stride_O = out.strides()[4];
-
-  bool is_idil_one =
-      in_dilation[0] == 1 && in_dilation[1] == 1 && in_dilation[2] == 1;
-
-  auto pt_conv_no_checks = [&](const T* in_ptr,
-                               const T* wt_ptr,
-                               T* out_ptr,
-                               int od,
-                               int oh,
-                               int ow) {
-    out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;
-    int id_base = od * wt_strides[0] - padding[0];
-    int ih_base = oh * wt_strides[1] - padding[1];
-    int iw_base = ow * wt_strides[2] - padding[2];
-
-    for (int o = 0; o < O; ++o) {
-      float r = 0.;
-
-      for (int wd = 0; wd < wD; ++wd) {
-        for (int wh = 0; wh < wH; ++wh) {
-          for (int ww = 0; ww < wW; ++ww) {
-            int wd_flip = flip ? wD - wd - 1 : wd;
-            int wh_flip = flip ? wH - wh - 1 : wh;
-            int ww_flip = flip ? wW - ww - 1 : ww;
-            int id = id_base + wd_flip * wt_dilation[0];
-            int ih = ih_base + wh_flip * wt_dilation[1];
-            int iw = iw_base + ww_flip * wt_dilation[2];
-
-            const T* wt_ptr_pt =
-                wt_ptr + wd * wt_stride_D + wh * wt_stride_H + ww * wt_stride_W;
-            const T* in_ptr_pt =
-                in_ptr + id * in_stride_D + ih * in_stride_H + iw * in_stride_W;
-
-            for (int c = 0; c < C; ++c) {
-              r += static_cast<float>(in_ptr_pt[0]) *
-                  static_cast<float>(wt_ptr_pt[0]);
-              in_ptr_pt += in_stride_C;
-              wt_ptr_pt += wt_stride_C;
-            } // c
-
-          } // ww
-        } // wh
-      } // wd
-
-      out_ptr[0] = static_cast<T>(r);
-      out_ptr += out_stride_O;
-      wt_ptr += wt_stride_O;
-    } // o
-  };
-
-  int jump_d = flip ? -wt_dilation[0] : wt_dilation[0];
-  int jump_h = flip ? -wt_dilation[1] : wt_dilation[1];
-  int jump_w = flip ? -wt_dilation[2] : wt_dilation[2];
-
-  int init_d = (flip ? (wD - 1) * wt_dilation[0] : 0);
-  int init_h = (flip ? (wH - 1) * wt_dilation[1] : 0);
-  int init_w = (flip ? (wW - 1) * wt_dilation[2] : 0);
-
-  int f_wgt_jump_d = std::lcm(in_dilation[0], wt_dilation[0]) / wt_dilation[0];
-  int f_wgt_jump_h = std::lcm(in_dilation[1], wt_dilation[1]) / wt_dilation[1];
-  int f_wgt_jump_w = std::lcm(in_dilation[2], wt_dilation[2]) / wt_dilation[2];
-
-  int f_out_jump_d = std::lcm(in_dilation[0], wt_strides[0]) / wt_strides[0];
-  int f_out_jump_h = std::lcm(in_dilation[1], wt_strides[1]) / wt_strides[1];
-  int f_out_jump_w = std::lcm(in_dilation[2], wt_strides[2]) / wt_strides[2];
-
-  std::vector<int> base_d(f_out_jump_d);
-  std::vector<int> base_h(f_out_jump_h);
-  std::vector<int> base_w(f_out_jump_w);
-
-  for (int i = 0; i < f_out_jump_d; ++i) {
-    int id_loop = i * wt_strides[0] - padding[0] + init_d;
-
-    int wd_base = 0;
-    while (wd_base < wD && id_loop % in_dilation[0] != 0) {
-      wd_base++;
-      id_loop += jump_d;
-    }
-
-    base_d[i] = wd_base;
-  }
-
-  for (int i = 0; i < f_out_jump_h; ++i) {
-    int ih_loop = i * wt_strides[1] - padding[1] + init_h;
-
-    int wh_base = 0;
-    while (wh_base < wH && ih_loop % in_dilation[1] != 0) {
-      wh_base++;
-      ih_loop += jump_h;
-    }
-
-    base_h[i] = wh_base;
-  }
-
-  for (int j = 0; j < f_out_jump_w; ++j) {
-    int iw_loop = j * wt_strides[2] - padding[2] + init_w;
-
-    int ww_base = 0;
-    while (ww_base < wW && iw_loop % in_dilation[2] != 0) {
-      ww_base++;
-      iw_loop += jump_w;
-    }
-
-    base_w[j] = ww_base;
-  }
-
-  auto pt_conv_all_checks = [&](const T* in_ptr,
-                                const T* wt_ptr,
-                                T* out_ptr,
-                                int od,
-                                int oh,
-                                int ow) {
-    out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;
-
-    int id_base = od * wt_strides[0] - padding[0];
-    int ih_base = oh * wt_strides[1] - padding[1];
-    int iw_base = ow * wt_strides[2] - padding[2];
-
-    int wd_base = base_d[od % f_out_jump_d];
-    int wh_base = base_h[oh % f_out_jump_h];
-    int ww_base = base_w[ow % f_out_jump_w];
-
-    for (int o = 0; o < O; ++o) {
-      float r = 0.;
-
-      for (int wd = wd_base; wd < wD; wd += f_wgt_jump_d) {
-        for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
-          for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
-            int wd_flip = flip ? wD - wd - 1 : wd;
-            int wh_flip = flip ? wH - wh - 1 : wh;
-            int ww_flip = flip ? wW - ww - 1 : ww;
-            int id = id_base + wd_flip * wt_dilation[0];
-            int ih = ih_base + wh_flip * wt_dilation[1];
-            int iw = iw_base + ww_flip * wt_dilation[2];
-
-            if (id >= 0 && id < iD && ih >= 0 && ih < iH && iw >= 0 &&
-                iw < iW) {
-              const T* wt_ptr_pt = wt_ptr + wd * wt_stride_D +
-                  wh * wt_stride_H + ww * wt_stride_W;
-
-              int id_dil = !is_idil_one ? (id / in_dilation[0]) : id;
-              int ih_dil = !is_idil_one ? (ih / in_dilation[1]) : ih;
-              int iw_dil = !is_idil_one ? (iw / in_dilation[2]) : iw;
-
-              const T* in_ptr_pt = in_ptr + id_dil * in_stride_D +
-                  ih_dil * in_stride_H + iw_dil * in_stride_W;
-
-              for (int c = 0; c < C; ++c) {
-                r += static_cast<float>(in_ptr_pt[0]) *
-                    static_cast<float>(wt_ptr_pt[0]);
-                in_ptr_pt += in_stride_C;
-                wt_ptr_pt += wt_stride_C;
-              } // c
-
-            } // iD, ih, iw check
-          } // ww
-        } // wh
-      } // wd
-
-      out_ptr[0] = static_cast<T>(r);
-      out_ptr += out_stride_O;
-      wt_ptr += wt_stride_O;
-    } // o
-  };
-
-  int oD_border_0 = 0;
-  int oD_border_1 =
-      is_idil_one ? ((padding[0] + wt_strides[0] - 1) / wt_strides[0]) : oD;
-  int oD_border_2 = std::max(
-      oD_border_1, (iD + padding[0] - wD * wt_dilation[0]) / wt_strides[0]);
-  int oD_border_3 = oD;
-
-  int oH_border_0 = 0;
-  int oH_border_1 =
-      is_idil_one ? ((padding[1] + wt_strides[1] - 1) / wt_strides[1]) : oH;
-  int oH_border_2 = std::max(
-      oH_border_1, (iH + padding[1] - wH * wt_dilation[1]) / wt_strides[1]);
-  int oH_border_3 = oH;
-
-  int oW_border_0 = 0;
-  int oW_border_1 =
-      is_idil_one ? ((padding[2] + wt_strides[2] - 1) / wt_strides[2]) : oW;
-  int oW_border_2 = std::max(
-      oW_border_1, (iW + padding[2] - wW * wt_dilation[2]) / wt_strides[2]);
-  int oW_border_3 = oW;
-
-  for (int n = 0; n < N; ++n) {
-    // Case 1: od might put us out of bounds
-    for (int od = oD_border_0; od < oD_border_1; ++od) {
-      for (int oh = 0; oh < oH; ++oh) {
-        for (int ow = 0; ow < oW; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
-        } // ow
-      } // oh
-    } // od
-
-    // Case 2: od in bounds
-    for (int od = oD_border_1; od < oD_border_2; ++od) {
-      // Case 2.1: oh might put us out of bounds
-      for (int oh = oH_border_0; oh < oH_border_1; ++oh) {
-        for (int ow = 0; ow < oW; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
-        } // ow
-      } // oh
-
-      // Case 2.2: oh in bounds
-      for (int oh = oH_border_1; oh < oH_border_2; ++oh) {
-        // Case 2.2.1: ow might put us out of bounds
-        for (int ow = oW_border_0; ow < oW_border_1; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
-        } // ow
-
-        // Case 2.2.2: ow in bounds
-        for (int ow = oW_border_1; ow < oW_border_2; ++ow) {
-          pt_conv_no_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
-        } // ow
-
-        // Case 2.2.3: ow might put us out of bounds
-        for (int ow = oW_border_2; ow < oW_border_3; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
-        } // ow
-      } // oh
-
-      // Case 2.3: oh might put us out of bounds
-      for (int oh = oH_border_2; oh < oH_border_3; ++oh) {
-        for (int ow = 0; ow < oW; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
-        } // ow
-      } // oh
-    } // od
-
-    // Case 3: od might put us out of bounds
-    for (int od = oD_border_2; od < oD_border_3; ++od) {
-      for (int oh = 0; oh < oH; ++oh) {
-        for (int ow = 0; ow < oW; ++ow) {
-          pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
-        } // ow
-      } // oh
-    } // od
-
-    st_in_ptr += in_stride_N;
-    st_out_ptr += out_stride_N;
-
-  } // n
-}
-
 void dispatch_slow_conv_1D(
    const array& in,
    const array& wt,
@@ -656,30 +358,6 @@ void dispatch_slow_conv_2D(
  }
 }

-void dispatch_slow_conv_3D(
-    const array& in,
-    const array& wt,
-    array out,
-    const std::vector<int>& padding,
-    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation,
-    const std::vector<int>& in_dilation,
-    bool flip) {
-  if (in.dtype() == float32) {
-    return slow_conv_3D<float>(
-        in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
-  } else if (in.dtype() == float16) {
-    return slow_conv_3D<float16_t>(
-        in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
-  } else if (in.dtype() == bfloat16) {
-    return slow_conv_3D<bfloat16_t>(
-        in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
-  } else {
-    throw std::invalid_argument(
-        "[Convolution::eval] got unsupported data type.");
-  }
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // Explicit gemm conv
 ///////////////////////////////////////////////////////////////////////////////
@@ -904,131 +582,6 @@ void explicit_gemm_conv_2D_cpu(
  }
 }

-void explicit_gemm_conv_ND_cpu(
-    const array& in,
-    const array& wt,
-    array out,
-    const std::vector<int>& padding,
-    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation) {
-  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
-  const auto iDim = std::vector<int>(
-      in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
-  const auto oDim = std::vector<int>(
-      out.shape().begin() + 1, out.shape().end() - 1); // Output spatial dim
-  const int O = wt.shape(0); // Out channels
-  const int C = wt.shape(-1); // In channels
-  const auto wDim = std::vector<int>(
-      wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim
-
-  auto conv_dtype = float32;
-
-  // Pad input
-  std::vector<int> padded_shape(in.shape().size());
-  padded_shape.front() = N;
-  for (size_t i = 0; i < iDim.size(); i++) {
-    padded_shape[i + 1] = iDim[i] + 2 * padding[i];
-  }
-  padded_shape.back() = C;
-  array in_padded(padded_shape, conv_dtype, nullptr, {});
-
-  // Fill with zeros
-  copy(array(0, conv_dtype), in_padded, CopyType::Scalar);
-
-  // Pick input slice from padded
-  size_t data_offset = 0;
-  for (size_t i = 0; i < padding.size(); i++) {
-    data_offset += padding[i] * in_padded.strides()[i + 1];
-  }
-  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
-  in_padded_slice.copy_shared_buffer(
-      in_padded,
-      in_padded.strides(),
-      in_padded.flags(),
-      in_padded_slice.size(),
-      data_offset);
-
-  // Copy input values into the slice
-  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);
-
-  // Make strided view
-  std::vector<int> strided_shape(oDim.size() + wDim.size() + 2);
-  strided_shape.front() = N;
-  for (size_t i = 0; i < oDim.size(); i++) {
-    strided_shape[i + 1] = oDim[i];
-  }
-  for (size_t i = 0; i < wDim.size(); i++) {
-    strided_shape[i + 1 + oDim.size()] = wDim[i];
-  }
-  strided_shape.back() = C;
-
-  std::vector<size_t> strided_strides(in.shape().size() * 2 - 2);
-  strided_strides[0] = in_padded.strides()[0];
-  for (size_t i = 0; i < wt_strides.size(); i++) {
-    strided_strides[i + 1] = in_padded.strides()[i + 1] * wt_strides[i];
-  }
-  for (size_t i = 1; i < in_padded.strides().size(); i++) {
-    strided_strides[i + wt_strides.size()] = in_padded.strides()[i];
-  }
-
-  auto flags = in_padded.flags();
-
-  array in_strided_view(strided_shape, in_padded.dtype(), nullptr, {});
-  in_strided_view.copy_shared_buffer(
-      in_padded, strided_strides, flags, in_strided_view.size(), 0);
-
-  // Materialize strided view
-  std::vector<int> strided_reshape = {N, C};
-  for (const auto& o : oDim) {
-    strided_reshape[0] *= o;
-  }
-  for (const auto& w : wDim) {
-    strided_reshape[1] *= w;
-  }
-
-  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
-  copy(in_strided_view, in_strided, CopyType::General);
-
-  // Check wt dtype and prepare
-  auto gemm_wt = wt;
-  auto gemm_out = out;
-
-  if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
-    auto ctype =
-        wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-    gemm_wt = array(wt.shape(), float32, nullptr, {});
-    copy(wt, gemm_wt, ctype);
-  }
-
-  if (out.dtype() != float32) {
-    gemm_out = array(out.shape(), float32, nullptr, {});
-    gemm_out.set_data(allocator::malloc_or_wait(gemm_out.nbytes()));
-  }
-
-  // Perform gemm
-  cblas_sgemm(
-      CblasRowMajor,
-      CblasNoTrans, // no trans A
-      CblasTrans, // transB
-      strided_reshape[0], // M
-      O, // N
-      strided_reshape[1], // K
-      1.0f, // alpha
-      in_strided.data<float>(),
-      strided_reshape[1], // lda
-      gemm_wt.data<float>(),
-      strided_reshape[1], // ldb
-      0.0f, // beta
-      gemm_out.data<float>(),
-      O // ldc
-  );
-
-  // Copy results if needed
-  if (out.dtype() != float32) {
-    copy(gemm_out, out, CopyType::Vector);
-  }
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // Conv routing
 ///////////////////////////////////////////////////////////////////////////////
@@ -1064,19 +617,6 @@ void conv_2D_cpu(
      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
 }

-void conv_3D_cpu(
-    const array& in,
-    const array& wt,
-    array out,
-    const std::vector<int>& padding,
-    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation,
-    const std::vector<int>& in_dilation,
-    bool flip) {
-  return dispatch_slow_conv_3D(
-      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
-}
-
 } // namespace

 void Convolution::eval(const std::vector<array>& inputs, array& out) {
@@ -1085,20 +625,8 @@ void Convolution::eval(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];
  auto& wt = inputs[1];

-  // 3D convolution
-  if (in.ndim() == (3 + 2)) {
-    return conv_3D_cpu(
-        in,
-        wt,
-        out,
-        padding_,
-        kernel_strides_,
-        kernel_dilation_,
-        input_dilation_,
-        flip_);
-  }
  // 2D convolution
-  else if (in.ndim() == (2 + 2)) {
+  if (in.ndim() == (2 + 2)) {
    return conv_2D_cpu(
        in,
        wt,
--- a/mlx/backend/common/copy.cpp
+++ b/mlx/backend/common/copy.cpp
@@ -4,7 +4,6 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/common/copy.h"
-#include "mlx/backend/common/utils.h"

 namespace mlx::core {

@@ -143,31 +142,29 @@ void copy_general(
    const std::vector<int>& data_shape,
    const std::vector<stride_t>& i_strides,
    int64_t i_offset) {
-  auto [new_shape, new_strides] = collapse_contiguous_dims(
-      data_shape, std::vector<std::vector<stride_t>>{i_strides});
-  switch (new_shape.size()) {
+  switch (src.ndim()) {
    case 1:
      copy_general_dim1<SrcT, DstT, stride_t>(
-          src, dst, new_shape, new_strides[0], i_offset);
+          src, dst, data_shape, i_strides, i_offset);
      return;
    case 2:
      copy_general_dim2<SrcT, DstT, stride_t>(
-          src, dst, new_shape, new_strides[0], i_offset);
+          src, dst, data_shape, i_strides, i_offset);
      return;
    case 3:
      copy_general_dim3<SrcT, DstT, stride_t>(
-          src, dst, new_shape, new_strides[0], i_offset);
+          src, dst, data_shape, i_strides, i_offset);
      return;
    case 4:
      copy_general_dim4<SrcT, DstT, stride_t>(
-          src, dst, new_shape, new_strides[0], i_offset);
+          src, dst, data_shape, i_strides, i_offset);
      return;
  }

  auto src_ptr = src.data<SrcT>() + i_offset;
  auto dst_ptr = dst.data<DstT>();
  for (size_t i = 0; i < dst.size(); ++i) {
-    stride_t src_elem = elem_to_loc(i, new_shape, new_strides[0]);
+    stride_t src_elem = elem_to_loc(i, data_shape, i_strides);
    dst_ptr[i] = static_cast<DstT>(src_ptr[src_elem]);
  }
 }
@@ -198,10 +195,10 @@ inline void copy_general_general_dims(
    const std::vector<int>& data_shape,
    const std::vector<stride_t>& i_strides,
    const std::vector<stride_t>& o_strides,
-    int64_t i_offset,
-    int64_t o_offset) {
+    stride_t i_offset,
+    stride_t o_offset) {
  if constexpr (D > 1) {
-    int axis = data_shape.size() - D;
+    int axis = src.ndim() - D;
    auto stride_src = i_strides[axis];
    auto stride_dst = o_strides[axis];
    auto N = data_shape[axis];
@@ -212,7 +209,7 @@ inline void copy_general_general_dims(
      o_offset += stride_dst;
    }
  } else {
-    int axis = data_shape.size() - 1;
+    int axis = src.ndim() - 1;
    auto stride_src = i_strides[axis];
    auto stride_dst = o_strides[axis];
    auto N = data_shape[axis];
@@ -233,76 +230,38 @@ void copy_general_general(
    const std::vector<int>& data_shape,
    const std::vector<stride_t>& i_strides,
    const std::vector<stride_t>& o_strides,
-    int64_t i_offset,
-    int64_t o_offset) {
-  auto [new_shape, new_strides] = collapse_contiguous_dims(
-      data_shape, std::vector<std::vector<stride_t>>{i_strides, o_strides});
-  switch (new_shape.size()) {
+    stride_t i_offset,
+    stride_t o_offset) {
+  switch (src.ndim()) {
    case 1:
      copy_general_general_dims<SrcT, DstT, stride_t, 1>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
+          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
      return;
    case 2:
      copy_general_general_dims<SrcT, DstT, stride_t, 2>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
+          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
      return;
    case 3:
      copy_general_general_dims<SrcT, DstT, stride_t, 3>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
+          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
      return;
    case 4:
      copy_general_general_dims<SrcT, DstT, stride_t, 4>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
+          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
      return;
    case 5:
      copy_general_general_dims<SrcT, DstT, stride_t, 5>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
+          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
      return;
  }

  int size = std::accumulate(
-      new_shape.end() - 5, new_shape.end(), 1, std::multiplies<int>());
+      data_shape.end() - 5, data_shape.end(), 1, std::multiplies<int>());
  for (int i = 0; i < src.size(); i += size) {
-    stride_t src_offset = i_offset + elem_to_loc(i, new_shape, new_strides[0]);
-    stride_t dst_offset = o_offset + elem_to_loc(i, new_shape, new_strides[1]);
+    stride_t src_offset = i_offset + elem_to_loc(i, data_shape, i_strides);
+    stride_t dst_offset = o_offset + elem_to_loc(i, dst.shape(), o_strides);
    copy_general_general_dims<SrcT, DstT, stride_t, 5>(
-        src,
-        dst,
-        new_shape,
-        new_strides[0],
-        new_strides[1],
-        src_offset,
-        dst_offset);
+        src, dst, data_shape, i_strides, o_strides, src_offset, dst_offset);
  }
 }

@@ -485,17 +444,8 @@ void copy_inplace(
  }
 }

-template void copy_inplace<size_t>(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<size_t>& i_strides,
-    const std::vector<size_t>& o_strides,
-    int64_t i_offset,
-    int64_t o_offset,
-    CopyType ctype);
-
-template void copy_inplace<int64_t>(
+template <>
+void copy_inplace<int64_t>(
    const array& src,
    array& dst,
    const std::vector<int>& data_shape,
@@ -503,6 +453,24 @@ template void copy_inplace<int64_t>(
    const std::vector<int64_t>& o_strides,
    int64_t i_offset,
    int64_t o_offset,
-    CopyType ctype);
+    CopyType ctype) {
+  switch (ctype) {
+    case CopyType::General:
+    case CopyType::GeneralGeneral:
+      return copy_inplace_dispatch(
+          src,
+          dst,
+          ctype,
+          data_shape,
+          i_strides,
+          o_strides,
+          i_offset,
+          o_offset);
+
+    case CopyType::Scalar:
+    case CopyType::Vector:
+      return copy_inplace_dispatch(src, dst, ctype);
+  }
+}

 } // namespace mlx::core
--- a/mlx/backend/common/cpu_impl.cpp
+++ b/mlx/backend/common/cpu_impl.cpp
@@ -0,0 +1,47 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include "mlx/backend/common/cpu_impl.h"
+#include "mlx/primitives.h"
+#include "mlx/scheduler.h"
+
+namespace mlx::core::cpu {
+
+std::function<void()> make_task(array arr, bool signal) {
+  return [arr = std::move(arr), signal]() mutable { // closure owns `arr`
+    auto stream = arr.primitive().stream();
+
+    // Block on every input whose valid event belongs to a different stream.
+    for (auto& input : arr.inputs()) {
+      if (input.event().valid() && input.event().stream() != stream) {
+        input.event().wait();
+      }
+    }
+
+    // Tell the scheduler that a task on this stream is starting to compute.
+    scheduler::notify_new_task(stream);
+
+    // Run the primitive's eval_cpu with the array's inputs and outputs.
+    auto outputs = arr.outputs();
+    arr.primitive().eval_cpu(arr.inputs(), outputs);
+
+    // Non-tracer arrays are detached after evaluation; when `signal` is set
+    // the array's event is signaled (presumably to wake waiting consumers).
+    if (!arr.is_tracer()) {
+      arr.detach();
+    }
+    if (signal) {
+      arr.event().signal();
+    }
+
+    // Tell the scheduler that the task on this stream has completed.
+    scheduler::notify_task_completion(stream);
+  };
+}
+
+std::function<void()> make_synchronize_task(
+    Stream s, // not referenced by the returned closure
+    std::shared_ptr<std::promise<void>> p) {
+  return [p = std::move(p)]() { p->set_value(); }; // fulfils `p` when run
+}
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/common/cpu_impl.h
+++ b/mlx/backend/common/cpu_impl.h
@@ -0,0 +1,18 @@
+// Copyright © 2024 Apple Inc.
+
+#pragma once
+
+#include <functional>
+#include <future>
+#include <memory>
+
+#include "mlx/array.h"
+
+namespace mlx::core::cpu {
+
+std::function<void()> make_task(array arr, bool signal);
+std::function<void()> make_synchronize_task(
+    Stream s,
+    std::shared_ptr<std::promise<void>> p);
+
+} // namespace mlx::core::cpu
--- a/mlx/backend/common/default_primitives.cpp
+++ b/mlx/backend/common/default_primitives.cpp
@@ -5,6 +5,7 @@
 #else
 #include <cblas.h>
 #endif
+
 #include <cstring>

 #include "mlx/array.h"
@@ -33,7 +34,6 @@ DEFAULT(ArcCosh)
 DEFAULT(ArcSin)
 DEFAULT(ArcSinh)
 DEFAULT(ArcTan)
-DEFAULT(ArcTan2)
 DEFAULT(ArcTanh)
 DEFAULT(ArgPartition)
 DEFAULT(ArgReduce)
@@ -42,17 +42,15 @@ DEFAULT(AsType)
 DEFAULT(AsStrided)
 DEFAULT(Broadcast)
 DEFAULT(BlockMaskedMM)
-DEFAULT(GatherMM)
-DEFAULT(GatherQMM)
+DEFAULT(BlockSparseMM)
 DEFAULT_MULTI(DivMod)
 DEFAULT(Ceil)
 DEFAULT(Concatenate)
-DEFAULT(Conjugate)
 DEFAULT(Convolution)
 DEFAULT(Copy)
 DEFAULT(Cos)
 DEFAULT(Cosh)
-DEFAULT_MULTI(CustomTransforms)
+DEFAULT_MULTI(CustomVJP)
 DEFAULT_MULTI(Depends)
 DEFAULT(Divide)
 DEFAULT(NumberOfElements)
@@ -68,10 +66,8 @@ DEFAULT(Full)
 DEFAULT(Gather)
 DEFAULT(Greater)
 DEFAULT(GreaterEqual)
-DEFAULT(Hadamard)
 DEFAULT(Less)
 DEFAULT(LessEqual)
-DEFAULT(Load)
 DEFAULT(Log)
 DEFAULT(Log1p)
 DEFAULT(LogicalNot)
@@ -113,7 +109,6 @@ DEFAULT(Tan)
 DEFAULT(Tanh)
 DEFAULT(Transpose)
 DEFAULT(Inverse)
-DEFAULT(Cholesky)

 namespace {

--- a/mlx/backend/common/hadamard.cpp
+++ b/mlx/backend/common/hadamard.cpp
@@ -1,107 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include <cassert>
-
-#include "mlx/backend/common/copy.h"
-#include "mlx/backend/common/hadamard.h"
-#include "mlx/primitives.h"
-
-namespace mlx::core {
-
-// n = 2^k component
-template <typename T>
-void hadamard_n(array& out, int n, int m, float scale) {
-  for (int b = 0; b < out.size() / n; b++) {
-    size_t loc = b * n;
-    T* data_ptr = out.data<T>() + loc;
-    int h = 1;
-    int n_over_2 = n / 2;
-    while (h < n) {
-      for (int i = 0; i < n / 2; i++) {
-        int k = i & (h - 1);
-        int j = ((i - k) << 1) + k;
-        float x = *(data_ptr + j);
-        float y = *(data_ptr + j + h);
-        *(data_ptr + j) = x + y;
-        *(data_ptr + j + h) = x - y;
-        if (h == n_over_2) {
-          *(data_ptr + j) *= scale;
-          *(data_ptr + j + h) *= scale;
-        }
-      }
-      h <<= 1;
-    }
-  }
-}
-
-// m component
-template <typename T>
-void hadamard_m(array& out, int n, int m, float scale) {
-  auto h_matrices = hadamard_matrices();
-  auto& matrix = h_matrices[m];
-  auto start = 1;
-  auto end = matrix.find('\n', start);
-  std::vector<bool> hmat_vec;
-  while (end != std::string_view::npos) {
-    auto row = matrix.substr(start, end - start);
-    for (int i = 0; i < row.length(); i++) {
-      hmat_vec.push_back(row[i] == '+');
-    }
-    start = end + 1;
-    end = matrix.find('\n', start);
-  }
-
-  for (int b = 0; b < out.size() / m / n; b++) {
-    size_t loc = b * n * m;
-    T* data_ptr = out.data<T>() + loc;
-    for (int i = 0; i < n; i++) {
-      std::vector<float> out(m);
-      for (int j = 0; j < m; j++) {
-        for (int k = 0; k < m; k++) {
-          float x = *(data_ptr + i + k * n);
-          if (hmat_vec[k + j * m]) {
-            out[j] += x;
-          } else {
-            out[j] -= x;
-          }
-        }
-      }
-      for (int j = 0; j < m; j++) {
-        *(data_ptr + i + j * n) = out[j] * scale;
-      }
-    }
-  }
-}
-
-template <typename T>
-void hadamard(array& out, int n, int m, float scale) {
-  float n_scale = m > 1 ? 1.0 : scale;
-  hadamard_n<T>(out, n, m, n_scale);
-  if (m > 1) {
-    hadamard_m<T>(out, n, m, scale);
-  }
-}
-
-void Hadamard::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  auto& in = inputs[0];
-
-  // Copy input to output
-  copy(in, out, CopyType::General);
-
-  int axis = out.ndim() - 1;
-  auto [n, m] = decompose_hadamard(out.shape(axis));
-
-  switch (in.dtype()) {
-    case float32:
-      return hadamard<float>(out, n, m, scale_);
-    case float16:
-      return hadamard<float16_t>(out, n, m, scale_);
-    case bfloat16:
-      return hadamard<bfloat16_t>(out, n, m, scale_);
-    default:
-      throw std::invalid_argument("[hadamard] Unsupported type.");
-  }
-}
-
-} // namespace mlx::core
--- a/mlx/backend/common/hadamard.h
+++ b/mlx/backend/common/hadamard.h
@@ -1,105 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#pragma once
-
-#include <map>
-
-#include "mlx/utils.h"
-
-namespace mlx::core {
-
-// From http://neilsloane.com/hadamard/
-constexpr std::string_view h12 = R"(
-+-++++++++++
--+-+-+-+-+-
-+++-++----++
-+---+--+-++-
-+++++-++----
-+-+---+--+-+
-++--+++-++--
-+--++---+--+
-++----+++-++
-+--+-++---+-
-++++----+++-
-+-+--+-++---
-)";
-
-constexpr std::string_view h20 = R"(
-+----+----++--++-++-
-+----+---+++---+-++
--+----+---+++-+-+-+
---+----+---+++++-+-
----+----++--++-++-+
-+++++-----+--+++--+
-+-+++-+---+-+--+++--
-++-++--+---+-+--+++-
-+++-+---+---+-+--+++
-++++-----++--+-+--++
--++-+-++-+-----++++
---++-+-++-+---+-+++
-+---++-+-+--+--++-++
-++---++-+----+-+++-+
-++---++-+----+++++-
-+--+--++-+----+----
-+-+-----++-+----+---
-+-+-+---+--+----+--
--+-+++------+----+-
-+--+--++------+----+
-)";
-
-constexpr std::string_view h28 = R"(
-+------++----++-+--+-+--++--
-+-----+++-----+-+--+-+--++-
--+-----+++---+-+-+----+--++
---+-----+++---+-+-+-+--+--+
----+-----+++---+-+-+++--+--
-----+-----++++--+-+--++--+-
------++----++-+--+-+--++--+
--++++-+-------++--+++-+--+-
---++++-+-----+-++--+-+-+--+
-+---+++--+----++-++--+-+-+--
-++---++---+----++-++--+-+-+-
-+++---+----+----++-++--+-+-+
-++++--------+-+--++-++--+-+-
-++++--------+++--++--+--+-+
-+-++-++--++--+--------++++-
-+-+-++--+--++--+--------++++
-+-+-++--+--++--+----+---+++
-+-+-+-++--+--+---+---++---++
-++-+-+-++--+------+--+++---+
-++-+-+-++--+------+-++++---
-+-++-+---++--+------+-++++--
-++--++-+-++-+++----++------
-+-++--++-+-++-+++-----+-----
-++-++---+-+-++-+++-----+----
-++-++-+-+-+-+--+++-----+---
--++-++++-+-+----+++-----+--
-+--++-+-++-+-+----+++-----+-
-++--++-+-++-+-+----++------+
-)";
-
-inline const std::map<int, std::string_view> hadamard_matrices() {
-  return {{12, h12}, {20, h20}, {28, h28}};
-}
-
-inline std::pair<int, int> decompose_hadamard(int n) {
-  // n = m*2^k
-  int m = 1;
-  if (!is_power_of_2(n)) {
-    auto h_matrices = hadamard_matrices();
-    for (auto [factor, _] : h_matrices) {
-      if (n % factor == 0) {
-        m = factor;
-        n /= factor;
-        break;
-      }
-    }
-    if (m == 1) {
-      throw std::invalid_argument(
-          "[hadamard] Only supports n = m*2^k where m in (1, 12, 20, 28).");
-    }
-  }
-  return {n, m};
-}
-
-} // namespace mlx::core
--- a/mlx/backend/common/inverse.cpp
+++ b/mlx/backend/common/inverse.cpp
@@ -2,6 +2,7 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/common/copy.h"
+#include "mlx/linalg.h"
 #include "mlx/primitives.h"

 #ifdef ACCELERATE_NEW_LAPACK
@@ -10,106 +11,9 @@
 #include <lapack.h>
 #endif

-// Wrapper to account for differences in
-// LAPACK implementations (basically how to pass the 'uplo' string to fortran).
-int strtri_wrapper(char uplo, char diag, float* matrix, int N) {
-  int info;
-
-#ifdef LAPACK_FORTRAN_STRLEN_END
-  strtri_(
-      /* uplo = */ &uplo,
-      /* diag = */ &diag,
-      /* N = */ &N,
-      /* a = */ matrix,
-      /* lda = */ &N,
-      /* info = */ &info,
-      /* uplo_len = */ static_cast<size_t>(1),
-      /* diag_len = */ static_cast<size_t>(1));
-#else
-  strtri_(
-      /* uplo = */ &uplo,
-      /* diag = */ &diag,
-      /* N = */ &N,
-      /* a = */ matrix,
-      /* lda = */ &N,
-      /* info = */ &info);
-#endif
-
-  return info;
-}
-
 namespace mlx::core {

-void general_inv(array& inv, int N, int i) {
-  int info;
-  auto ipiv = array::Data{allocator::malloc_or_wait(sizeof(int) * N)};
-  // Compute LU factorization.
-  sgetrf_(
-      /* m = */ &N,
-      /* n = */ &N,
-      /* a = */ inv.data<float>() + N * N * i,
-      /* lda = */ &N,
-      /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
-      /* info = */ &info);
-
-  if (info != 0) {
-    std::stringstream ss;
-    ss << "inverse_impl: LU factorization failed with error code " << info;
-    throw std::runtime_error(ss.str());
-  }
-
-  static const int lwork_query = -1;
-  float workspace_size = 0;
-
-  // Compute workspace size.
-  sgetri_(
-      /* m = */ &N,
-      /* a = */ nullptr,
-      /* lda = */ &N,
-      /* ipiv = */ nullptr,
-      /* work = */ &workspace_size,
-      /* lwork = */ &lwork_query,
-      /* info = */ &info);
-
-  if (info != 0) {
-    std::stringstream ss;
-    ss << "inverse_impl: LU workspace calculation failed with error code "
-       << info;
-    throw std::runtime_error(ss.str());
-  }
-
-  const int lwork = workspace_size;
-  auto scratch = array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
-
-  // Compute inverse.
-  sgetri_(
-      /* m = */ &N,
-      /* a = */ inv.data<float>() + N * N * i,
-      /* lda = */ &N,
-      /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
-      /* work = */ static_cast<float*>(scratch.buffer.raw_ptr()),
-      /* lwork = */ &lwork,
-      /* info = */ &info);
-
-  if (info != 0) {
-    std::stringstream ss;
-    ss << "inverse_impl: inversion failed with error code " << info;
-    throw std::runtime_error(ss.str());
-  }
-}
-
-void tri_inv(array& inv, int N, int i, bool upper) {
-  const char uplo = upper ? 'L' : 'U';
-  const char diag = 'N';
-  int info = strtri_wrapper(uplo, diag, inv.data<float>() + N * N * i, N);
-  if (info != 0) {
-    std::stringstream ss;
-    ss << "inverse_impl: triangular inversion failed with error code " << info;
-    throw std::runtime_error(ss.str());
-  }
-}
-
-void inverse_impl(const array& a, array& inv, bool tri, bool upper) {
+void inverse_impl(const array& a, array& inv) {
  // Lapack uses the column-major convention. We take advantage of the following
  // identity to avoid transposing (see
  // https://math.stackexchange.com/a/340234):
@@ -121,11 +25,63 @@ void inverse_impl(const array& a, array& inv, bool tri, bool upper) {
  const int N = a.shape(-1);
  const size_t num_matrices = a.size() / (N * N);

+  int info;
+  auto ipiv = array::Data{allocator::malloc_or_wait(sizeof(int) * N)};
+
  for (int i = 0; i < num_matrices; i++) {
-    if (tri) {
-      tri_inv(inv, N, i, upper);
-    } else {
-      general_inv(inv, N, i);
+    // Compute LU factorization.
+    sgetrf_(
+        /* m = */ &N,
+        /* n = */ &N,
+        /* a = */ inv.data<float>() + N * N * i,
+        /* lda = */ &N,
+        /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
+        /* info = */ &info);
+
+    if (info != 0) {
+      std::stringstream ss;
+      ss << "inverse_impl: LU factorization failed with error code " << info;
+      throw std::runtime_error(ss.str());
+    }
+
+    static const int lwork_query = -1;
+    float workspace_size = 0;
+
+    // Compute workspace size.
+    sgetri_(
+        /* m = */ &N,
+        /* a = */ nullptr,
+        /* lda = */ &N,
+        /* ipiv = */ nullptr,
+        /* work = */ &workspace_size,
+        /* lwork = */ &lwork_query,
+        /* info = */ &info);
+
+    if (info != 0) {
+      std::stringstream ss;
+      ss << "inverse_impl: LU workspace calculation failed with error code "
+         << info;
+      throw std::runtime_error(ss.str());
+    }
+
+    const int lwork = workspace_size;
+    auto scratch =
+        array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
+
+    // Compute inverse.
+    sgetri_(
+        /* m = */ &N,
+        /* a = */ inv.data<float>() + N * N * i,
+        /* lda = */ &N,
+        /* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
+        /* work = */ static_cast<float*>(scratch.buffer.raw_ptr()),
+        /* lwork = */ &lwork,
+        /* info = */ &info);
+
+    if (info != 0) {
+      std::stringstream ss;
+      ss << "inverse_impl: inversion failed with error code " << info;
+      throw std::runtime_error(ss.str());
    }
  }
 }
@@ -134,7 +90,15 @@ void Inverse::eval(const std::vector<array>& inputs, array& output) {
  if (inputs[0].dtype() != float32) {
    throw std::runtime_error("[Inverse::eval] only supports float32.");
  }
-  inverse_impl(inputs[0], output, tri_, upper_);
+  inverse_impl(inputs[0], output);
+}
+
+std::pair<std::vector<array>, std::vector<int>> Inverse::vmap(
+    const std::vector<array>& inputs,
+    const std::vector<int>& axes) {
+  auto ax = axes[0] >= 0 ? 0 : -1; // out axis: 0, or -1 if axes[0] < 0
+  auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
+  return {{linalg::inv(a, stream())}, {ax}}; // axis moved to front if > 0
+}

 } // namespace mlx::core
--- a/mlx/backend/common/load.cpp
+++ b/mlx/backend/common/load.cpp
@@ -33,7 +33,7 @@ void Load::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 0);
  out.set_data(allocator::malloc_or_wait(out.nbytes()));

-  reader_->seek(offset_);
+  reader_->seek(offset_, std::ios_base::beg);
  reader_->read(out.data<char>(), out.nbytes());

  if (swap_endianness_) {
--- a/mlx/backend/common/make_compiled_preamble.sh
+++ b/mlx/backend/common/make_compiled_preamble.sh
@@ -21,14 +21,13 @@ EOM

 fi

-CONTENT=$($GCC -I "$SRCDIR" -E "$SRCDIR/mlx/backend/common/compiled_preamble.h" 2>/dev/null)
+CONTENT=$($GCC -I "$SRCDIR" -E "$SRCDIR/mlx/backend/common/compiled_preamble.h" 2>/dev/null) # quoted: SRCDIR may contain spaces

 cat << EOF > "$OUTPUT_FILE"
 const char* get_kernel_preamble() {
 return R"preamble(
 $INCLUDES
 $CONTENT
-using namespace mlx::core;
 using namespace mlx::core::detail;
 )preamble";
 }
--- a/mlx/backend/common/masked_mm.cpp
+++ b/mlx/backend/common/masked_mm.cpp
@@ -17,25 +17,24 @@ namespace mlx::core {

 namespace {

-template <typename T, typename mask_t>
+template <typename T>
 inline void mask_matrix(
    T* data,
-    const mask_t* mask,
+    const bool* mask,
    int block_size,
    const int X,
    const int Y,
    const size_t X_data_str,
    const size_t Y_data_str,
    const size_t X_mask_str,
-    const size_t Y_mask_str,
-    const size_t mask_offset) {
+    const size_t Y_mask_str) {
  int tX = (X + block_size - 1) / block_size;
  int tY = (Y + block_size - 1) / block_size;

  for (int i = 0; i < tX; i++) {
    for (int j = 0; j < tY; j++) {
-      mask_t do_mask = mask[mask_offset + i * X_mask_str + j * Y_mask_str];
-      if (do_mask != 1) {
+      bool do_mask = mask[i * X_mask_str + j * Y_mask_str];
+      if (!do_mask) {
        int loc_x = i * block_size;
        int loc_y = j * block_size;
        T* data_block = data + loc_x * X_data_str + loc_y * Y_data_str;
@@ -44,11 +43,7 @@ inline void mask_matrix(
        int size_y = std::min(block_size, Y - loc_y);
        for (int ii = 0; ii < size_x; ii++) {
          for (int jj = 0; jj < size_y; jj++) {
-            if constexpr (std::is_same_v<mask_t, bool>) {
-              data_block[ii * X_data_str + jj * Y_data_str] = T(0.);
-            } else {
-              data_block[ii * X_data_str + jj * Y_data_str] *= do_mask;
-            }
+            data_block[ii * X_data_str + jj * Y_data_str] = T(0.);
          }
        }
      }
@@ -67,39 +62,36 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {

  auto& a_pre = inputs[0];
  auto& b_pre = inputs[1];
+  auto& out_mask = inputs[2];

-  auto check_transpose =
-      [](const array& arr, bool do_copy, bool expand_all = false) {
-        auto stx = arr.strides()[arr.ndim() - 2];
-        auto sty = arr.strides()[arr.ndim() - 1];
-        if (!expand_all && stx == arr.shape(-1) && sty == 1) {
-          if (do_copy) {
-            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-            copy(arr, arr_copy, CopyType::Vector);
-            return std::make_tuple(false, stx, arr_copy);
-          }
-          return std::make_tuple(false, stx, arr);
-        } else if (!expand_all && stx == 1 && sty == arr.shape(-2)) {
-          if (do_copy) {
-            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-            copy(arr, arr_copy, CopyType::Vector);
-            return std::make_tuple(true, sty, arr_copy);
-          }
-          return std::make_tuple(true, sty, arr);
-        } else {
-          array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-          copy(arr, arr_copy, CopyType::General);
-          size_t stx = arr.shape(-1);
-          return std::make_tuple(false, stx, arr_copy);
-        }
-      };
+  auto check_transpose = [](const array& arr, bool do_copy) {
+    auto stx = arr.strides()[arr.ndim() - 2];
+    auto sty = arr.strides()[arr.ndim() - 1];
+    if (stx == arr.shape(-1) && sty == 1) {
+      if (do_copy) {
+        array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+        copy(arr, arr_copy, CopyType::Vector);
+        return std::make_tuple(false, stx, arr_copy);
+      }
+      return std::make_tuple(false, stx, arr);
+    } else if (stx == 1 && sty == arr.shape(-2)) {
+      if (do_copy) {
+        array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+        copy(arr, arr_copy, CopyType::Vector);
+        return std::make_tuple(true, sty, arr_copy);
+      }
+      return std::make_tuple(true, sty, arr);
+    } else {
+      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+      copy(arr, arr_copy, CopyType::General);
+      size_t stx = arr.shape(-1);
+      return std::make_tuple(false, stx, arr_copy);
+    }
+  };

  bool has_op_mask = inputs.size() > 3;
-  bool has_out_mask = inputs.size() == 3 || inputs.size() == 5;
-  auto [a_transposed, lda, a] =
-      check_transpose(a_pre, has_op_mask, inputs.back().dtype() != bool_);
-  auto [b_transposed, ldb, b] =
-      check_transpose(b_pre, has_op_mask, inputs.back().dtype() != bool_);
+  auto [a_transposed, lda, a] = check_transpose(a_pre, has_op_mask);
+  auto [b_transposed, ldb, b] = check_transpose(b_pre, has_op_mask);

  size_t M = a.shape(-2);
  size_t N = b.shape(-1);
@@ -122,42 +114,27 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
                       int Y,
                       size_t X_data_str,
                       size_t Y_data_str) {
-    size_t mask_offset = elem_to_loc(
-        mask.shape(-1) * mask.shape(-2) * batch_idx,
-        mask.shape(),
-        mask.strides());
+    const bool* mask_ptr = mask.data<bool>() +
+        elem_to_loc(mask.shape(-1) * mask.shape(-2) * batch_idx,
+                    mask.shape(),
+                    mask.strides());

    size_t X_mask_str = mask.strides()[mask.ndim() - 2];
    size_t Y_mask_str = mask.strides()[mask.ndim() - 1];

-    if (mask.dtype() == bool_) {
-      return mask_matrix(
-          data,
-          mask.data<bool>(),
-          block_size,
-          X,
-          Y,
-          X_data_str,
-          Y_data_str,
-          X_mask_str,
-          Y_mask_str,
-          mask_offset);
-    } else {
-      return mask_matrix(
-          data,
-          mask.data<float>(),
-          block_size,
-          X,
-          Y,
-          X_data_str,
-          Y_data_str,
-          X_mask_str,
-          Y_mask_str,
-          mask_offset);
-    }
+    return mask_matrix(
+        data,
+        mask_ptr,
+        block_size,
+        X,
+        Y,
+        X_data_str,
+        Y_data_str,
+        X_mask_str,
+        Y_mask_str);
  };

-  for (int i = 0; i < (out.size() / (M * size_t(N))); ++i) {
+  for (int i = 0; i < (a.size() / (M * K)); ++i) {
    // Adjust pointer
    float* ai =
        a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides());
@@ -167,7 +144,7 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {

    // Zero out blocks in a and b if needed
    if (has_op_mask) {
-      auto& a_mask = inputs[inputs.size() - 2];
+      auto& a_mask = inputs[3];
      mask_array(
          a_mask,
          ai,
@@ -178,7 +155,7 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
          a_transposed ? 1 : lda,
          a_transposed ? lda : 1);

-      auto& b_mask = inputs[inputs.size() - 1];
+      auto& b_mask = inputs[4];
      mask_array(
          b_mask,
          bi,
@@ -209,16 +186,14 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
    );

    // Zero out blocks in out
-    if (has_out_mask) {
-      mask_array(inputs[2], ci, block_size_, i, M, N, N, 1);
-    }
+    mask_array(out_mask, ci, block_size_, i, M, N, N, 1);
  }
 }

-void GatherMM::eval(const std::vector<array>& inputs, array& out) {
+void BlockSparseMM::eval(const std::vector<array>& inputs, array& out) {
  if (out.dtype() != float32) {
    throw std::runtime_error(
-        "[GatherMM::eval] Currently only supports float32.");
+        "[BlockSparseMM::eval] Currently only supports float32.");
  }
  out.set_data(allocator::malloc_or_wait(out.nbytes()));

@@ -302,4 +277,4 @@ void GatherMM::eval(const std::vector<array>& inputs, array& out) {
  }
 }

-} // namespace mlx::core
+} // namespace mlx::core
--- a/mlx/backend/common/ops.h
+++ b/mlx/backend/common/ops.h
@@ -108,146 +108,133 @@ struct Abs {
  template <typename T>
  T operator()(T x) {
    return std::abs(x);
-  }
+  };
  uint8_t operator()(uint8_t x) {
    return x;
-  }
+  };
  uint16_t operator()(uint16_t x) {
    return x;
-  }
+  };
  uint32_t operator()(uint32_t x) {
    return x;
-  }
+  };
  uint64_t operator()(uint64_t x) {
    return x;
-  }
+  };
  bool operator()(bool x) {
    return x;
-  }
+  };
 };

 struct ArcCos {
  template <typename T>
  T operator()(T x) {
    return std::acos(x);
-  }
+  };
 };

 struct ArcCosh {
  template <typename T>
  T operator()(T x) {
    return std::acosh(x);
-  }
+  };
 };

 struct ArcSin {
  template <typename T>
  T operator()(T x) {
    return std::asin(x);
-  }
+  };
 };

 struct ArcSinh {
  template <typename T>
  T operator()(T x) {
    return std::asinh(x);
-  }
+  };
 };

 struct ArcTan {
  template <typename T>
  T operator()(T x) {
    return std::atan(x);
-  }
-};
-
-struct ArcTan2 {
-  template <typename T>
-  T operator()(T y, T x) {
-    return std::atan2(y, x);
-  }
+  };
 };

 struct ArcTanh {
  template <typename T>
  T operator()(T x) {
    return std::atanh(x);
-  }
+  };
 };

 struct Ceil {
  template <typename T>
  T operator()(T x) {
    return std::ceil(x);
-  }
+  };
  int8_t operator()(int8_t x) {
    return x;
-  }
+  };
  int16_t operator()(int16_t x) {
    return x;
-  }
+  };
  int32_t operator()(int32_t x) {
    return x;
-  }
+  };
  int64_t operator()(int64_t x) {
    return x;
-  }
+  };
  uint8_t operator()(uint8_t x) {
    return x;
-  }
+  };
  uint16_t operator()(uint16_t x) {
    return x;
-  }
+  };
  uint32_t operator()(uint32_t x) {
    return x;
-  }
+  };
  uint64_t operator()(uint64_t x) {
    return x;
-  }
+  };
  bool operator()(bool x) {
    return x;
-  }
-};
-
-struct Conjugate {
-  complex64_t operator()(complex64_t x) {
-    return std::conj(x);
-  }
+  };
 };

 struct Cos {
  template <typename T>
  T operator()(T x) {
    return std::cos(x);
-  }
+  };
 };

 struct Cosh {
  template <typename T>
  T operator()(T x) {
    return std::cosh(x);
-  }
+  };
 };

 struct Erf {
  template <typename T>
  T operator()(T x) {
    return static_cast<T>(fast_erf(static_cast<float>(x)));
-  }
+  };
 };

 struct ErfInv {
  template <typename T>
  T operator()(T x) {
    return static_cast<T>(fast_erfinv(static_cast<float>(x)));
-  }
+  };
 };

 struct Exp {
  template <typename T>
  T operator()(T x) {
    return fast_exp(x);
-  }
+  };

  complex64_t operator()(complex64_t x) {
    return std::exp(x);
@@ -258,83 +245,83 @@ struct Expm1 {
  template <typename T>
  T operator()(T x) {
    return expm1(x);
-  }
+  };
 };

 struct Floor {
  template <typename T>
  T operator()(T x) {
    return std::floor(x);
-  }
+  };
  int8_t operator()(int8_t x) {
    return x;
-  }
+  };
  int16_t operator()(int16_t x) {
    return x;
-  }
+  };
  int32_t operator()(int32_t x) {
    return x;
-  }
+  };
  int64_t operator()(int64_t x) {
    return x;
-  }
+  };
  uint8_t operator()(uint8_t x) {
    return x;
-  }
+  };
  uint16_t operator()(uint16_t x) {
    return x;
-  }
+  };
  uint32_t operator()(uint32_t x) {
    return x;
-  }
+  };
  uint64_t operator()(uint64_t x) {
    return x;
-  }
+  };
  bool operator()(bool x) {
    return x;
-  }
+  };
 };

 struct Log {
  template <typename T>
  T operator()(T x) {
    return std::log(x);
-  }
+  };
 };

 struct Log2 {
  template <typename T>
  T operator()(T x) {
    return std::log2(x);
-  }
+  };
 };

 struct Log10 {
  template <typename T>
  T operator()(T x) {
    return std::log10(x);
-  }
+  };
 };

 struct Log1p {
  template <typename T>
  T operator()(T x) {
    return log1p(x);
-  }
+  };
 };

 struct LogicalNot {
  template <typename T>
  T operator()(T x) {
    return !x;
-  }
+  };
 };

 struct Negative {
  template <typename T>
  T operator()(T x) {
    return -x;
-  }
+  };
 };

 struct Round {
@@ -373,59 +360,55 @@ struct Sign {
  uint64_t operator()(uint64_t x) {
    return x != 0;
  }
-
-  complex64_t operator()(complex64_t x) {
-    return x == complex64_t(0) ? x : x / std::abs(x);
-  }
 };

 struct Sin {
  template <typename T>
  T operator()(T x) {
    return std::sin(x);
-  }
+  };
 };

 struct Sinh {
  template <typename T>
  T operator()(T x) {
    return std::sinh(x);
-  }
+  };
 };

 struct Square {
  template <typename T>
  T operator()(T x) {
    return x * x;
-  }
+  };
 };

 struct Sqrt {
  template <typename T>
  T operator()(T x) {
    return std::sqrt(x);
-  }
+  };
 };

 struct Rsqrt {
  template <typename T>
  T operator()(T x) {
    return static_cast<decltype(x)>(1.0) / std::sqrt(x);
-  }
+  };
 };

 struct Tan {
  template <typename T>
  T operator()(T x) {
    return std::tan(x);
-  }
+  };
 };

 struct Tanh {
  template <typename T>
  T operator()(T x) {
    return std::tanh(x);
-  }
+  };
 };

 struct Add {
@@ -558,7 +541,7 @@ struct LogAddExp {
        ? maxval
        : static_cast<decltype(x)>(
              maxval + std::log1p(fast_exp(minval - maxval)));
-  }
+  };
 };

 struct Multiply {
@@ -606,14 +589,14 @@ struct LogicalAnd {
  template <typename T>
  T operator()(T x, T y) {
    return x && y;
-  }
+  };
 };

 struct LogicalOr {
  template <typename T>
  T operator()(T x, T y) {
    return x || y;
-  }
+  };
 };

 struct Select {
@@ -627,35 +610,35 @@ struct BitwiseAnd {
  template <typename T>
  T operator()(T x, T y) {
    return x & y;
-  }
+  };
 };

 struct BitwiseOr {
  template <typename T>
  T operator()(T x, T y) {
    return x | y;
-  }
+  };
 };

 struct BitwiseXor {
  template <typename T>
  T operator()(T x, T y) {
    return x ^ y;
-  }
+  };
 };

 struct LeftShift {
  template <typename T>
  T operator()(T x, T y) {
    return x << y;
-  }
+  };
 };

 struct RightShift {
  template <typename T>
  T operator()(T x, T y) {
    return x >> y;
-  }
+  };
 };

 } // namespace mlx::core::detail
--- a/mlx/backend/common/primitives.cpp
+++ b/mlx/backend/common/primitives.cpp
@@ -1,4 +1,4 @@
-// Copyright © 2023-2024 Apple Inc.
+// Copyright © 2023 Apple Inc.

 #include <algorithm>
 #include <cassert>
@@ -8,9 +8,9 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/common/arange.h"
+#include "mlx/backend/common/binary.h"
 #include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/ops.h"
-#include "mlx/backend/common/slicing.h"
 #include "mlx/backend/common/threefry.h"
 #include "mlx/backend/common/unary.h"
 #include "mlx/backend/common/utils.h"
@@ -113,6 +113,61 @@ void AsType::eval(const std::vector<array>& inputs, array& out) {
  copy(in, out, ctype);
 }

+void AsStrided::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  // out is built on the input's buffer, addressed with strides_ and offset_.
+  auto& in = inputs[0];
+
+  if (!in.flags().row_contiguous) {
+    // Just ensuring that inputs[0] came from the ops which would ensure the
+    // input is row contiguous.
+    throw std::runtime_error(
+        "AsStrided must be used with row contiguous arrays only.");
+  }
+  // Compute the flags given the shape and strides. i walks the axes from last
+  // to first and j from first to last; r and c are the strides expected there.
+  bool row_contiguous = true, col_contiguous = true;
+  size_t r = 1, c = 1;
+  for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
+    row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
+    col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
+    r *= shape_[i];
+    c *= shape_[j];
+  }
+  auto flags = in.flags();
+  // TODO: Compute the contiguous flag in a better way cause now we are
+  //       unnecessarily strict.
+  flags.contiguous = row_contiguous || col_contiguous;
+  flags.row_contiguous = row_contiguous;
+  flags.col_contiguous = col_contiguous;
+
+  // There is no easy way to compute the actual data size so we use out.size().
+  // The contiguous flag will almost certainly not be set so no code should
+  // rely on data_size anyway.
+  size_t data_size = out.size();
+
+  return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
+}
+
+void Broadcast::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& src = inputs[0];
+  if (out.size() == 0) {
+    out.set_data(nullptr);
+    return;
+  }
+  const int lead = out.ndim() - src.ndim(); // input axes are right-aligned
+  std::vector<size_t> view_strides(out.ndim(), 0);
+  for (int d = 0, nd = src.ndim(); d < nd; ++d) {
+    view_strides[d + lead] = src.shape()[d] != 1 ? src.strides()[d] : 0;
+  }
+  auto flags = src.flags();
+  const bool grew = out.size() > src.size();
+  flags.row_contiguous = flags.row_contiguous && !grew;
+  flags.col_contiguous = flags.col_contiguous && !grew;
+  out.copy_shared_buffer(src, view_strides, flags, src.data_size());
+}
+
 void Ceil::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
@@ -148,15 +203,9 @@ void Concatenate::eval(const std::vector<array>& inputs, array& out) {
  }
 }

-void Conjugate::eval(const std::vector<array>& inputs, array& out) {
+void Copy::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
-  const auto& in = inputs[0];
-  if (out.dtype() == complex64) {
-    unary_fp(in, out, detail::Conjugate());
-  } else {
-    throw std::invalid_argument(
-        "[conjugate] conjugate must be called on complex input.");
-  }
+  out.copy_shared_buffer(inputs[0]);
 }

 void Cos::eval(const std::vector<array>& inputs, array& out) {
@@ -183,6 +232,81 @@ void Cosh::eval(const std::vector<array>& inputs, array& out) {
  }
 }

+void CustomVJP::eval(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  assert(inputs.size() > outputs.size());
+  const size_t skip = inputs.size() - outputs.size(); // leading inputs unread
+  for (size_t k = 0; k < outputs.size(); ++k) {
+    outputs[k].copy_shared_buffer(inputs[skip + k]);
+  }
+}
+
+void Depends::eval(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  assert(inputs.size() > outputs.size());
+  for (size_t k = 0; k < outputs.size(); ++k) {
+    outputs[k].copy_shared_buffer(inputs[k]); // output k pairs with input k
+  }
+}
+
+void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  // Accumulate in double: the product of the input's extents along axes_.
+  double numel = 1;
+  for (auto ax : axes_) {
+    numel *= inputs[0].shape(ax);
+  }
+
+  if (inverted_) {
+    numel = 1.0 / numel; // emit the reciprocal of the count instead
+  }
+  // Write the value to the first output element, cast to the output dtype.
+  switch (out.dtype()) {
+    case bool_:
+      *out.data<bool>() = static_cast<bool>(numel);
+      break;
+    case uint8:
+      *out.data<uint8_t>() = static_cast<uint8_t>(numel);
+      break;
+    case uint16:
+      *out.data<uint16_t>() = static_cast<uint16_t>(numel);
+      break;
+    case uint32:
+      *out.data<uint32_t>() = static_cast<uint32_t>(numel);
+      break;
+    case uint64:
+      *out.data<uint64_t>() = static_cast<uint64_t>(numel);
+      break;
+    case int8:
+      *out.data<int8_t>() = static_cast<int8_t>(numel);
+      break;
+    case int16:
+      *out.data<int16_t>() = static_cast<int16_t>(numel);
+      break;
+    case int32:
+      *out.data<int32_t>() = static_cast<int32_t>(numel);
+      break;
+    case int64:
+      *out.data<int64_t>() = static_cast<int64_t>(numel);
+      break;
+    case float16:
+      *out.data<float16_t>() = static_cast<float16_t>(numel);
+      break;
+    case float32:
+      *out.data<float>() = static_cast<float>(numel);
+      break;
+    case bfloat16:
+      *out.data<bfloat16_t>() = static_cast<bfloat16_t>(numel);
+      break;
+    case complex64:
+      *out.data<complex64_t>() = static_cast<complex64_t>(numel);
+      break;
+  }
+}
+
 void Erf::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
@@ -313,6 +437,20 @@ void LogicalNot::eval(const std::vector<array>& inputs, array& out) {
  unary(in, out, detail::LogicalNot());
 }

+void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
+  // Elementwise `x && y` of the two operands, evaluated through binary()
+  // with the detail::LogicalAnd functor.
+  assert(inputs.size() == 2);
+  binary(inputs[0], inputs[1], out, detail::LogicalAnd());
+}
+
+void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
+  // Elementwise `x || y` of the two operands, evaluated through binary()
+  // with the detail::LogicalOr functor.
+  assert(inputs.size() == 2);
+  binary(inputs[0], inputs[1], out, detail::LogicalOr());
+}
+
 void Negative::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
@@ -398,6 +536,63 @@ void RandomBits::eval(const std::vector<array>& inputs, array& out) {
  }
 }

+std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
+    const array& in,
+    const array& out) {
+  // Empty or row contiguous input: out's own strides are used, no copy.
+  if (in.size() == 0 || in.flags().row_contiguous) {
+    return {false, out.strides()};
+  }
+
+  // Special case for scalars: every output stride is 0, no copy.
+  if (in.ndim() == 0) {
+    std::vector<size_t> out_strides(out.ndim(), 0);
+    return {false, out_strides};
+  }
+
+  // Firstly let's collapse all the contiguous dimensions of the input
+  auto [shape, _strides] = collapse_contiguous_dims(in);
+  auto& strides = _strides[0];
+
+  // If shapes fit exactly in the contiguous dims then no copy is necessary so
+  // let's check.
+  std::vector<size_t> out_strides;
+  bool copy_necessary = false;
+  int j = 0; // index of the collapsed input dim currently being split
+  for (int i = 0; i < out.ndim(); i++) {
+    int N = out.shape(i);
+    if (j < shape.size() && shape[j] % N == 0) {
+      shape[j] /= N; // peel the output dim off the collapsed dim
+      out_strides.push_back(shape[j] * strides[j]);
+      j += (shape[j] == 1); // collapsed dim used up: move to the next
+    } else if (N == 1) {
+      // i > 0 because otherwise j < shape.size() && shape[j] % 1 == 0
+      out_strides.push_back(out_strides.back());
+    } else {
+      copy_necessary = true;
+      break;
+    }
+  }
+  // When a copy is necessary out_strides is only partially filled.
+  return {copy_necessary, out_strides};
+}
+
+void Reshape::shared_buffer_reshape(
+    const array& in,
+    const std::vector<size_t>& out_strides,
+    array& out) {
+  auto flags = in.flags();
+  if (flags.row_contiguous) {
+    // A row contiguous input is only shallow-copied. The result is col
+    // contiguous as well when it is effectively a vector: it has at most one
+    // element, or a single dimension accounts for every element.
+    const auto& dims = out.shape();
+    flags.col_contiguous = out.size() <= 1 ||
+        out.size() == *std::max_element(dims.begin(), dims.end());
+  }
+  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
+}
+
 void Reshape::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
@@ -405,17 +600,7 @@ void Reshape::eval(const std::vector<array>& inputs, array& out) {
  auto [copy_necessary, out_strides] = prepare_reshape(in, out);

  if (copy_necessary) {
-    out.set_data(allocator::malloc_or_wait(out.nbytes()));
-    auto out_strides = make_contiguous_strides<size_t>(in.shape());
-    copy_inplace<size_t>(
-        in,
-        out,
-        in.shape(),
-        in.strides(),
-        out_strides,
-        0,
-        0,
-        CopyType::General);
+    copy(in, out, in.data_size() == 1 ? CopyType::Scalar : CopyType::General);
  } else {
    shared_buffer_reshape(in, out_strides, out);
  }
@@ -478,6 +663,49 @@ void Sinh::eval(const std::vector<array>& inputs, array& out) {
  }
 }

+std::tuple<bool, int64_t, std::vector<int64_t>> Slice::prepare_slice(
+    const array& in) {
+  // Translate the slice's start indices and steps into a flat element offset
+  // and per-axis strides over the input; a negative step requests a copy.
+  bool needs_copy = false;
+  int64_t offset = 0;
+  std::vector<int64_t> stepped(in.ndim(), 0);
+  for (int ax = 0; ax < in.ndim(); ++ax) {
+    offset += start_indices_[ax] * in.strides()[ax];
+    stepped[ax] = in.strides()[ax] * strides_[ax];
+    needs_copy = needs_copy || strides_[ax] < 0;
+  }
+  return std::make_tuple(needs_copy, offset, stepped);
+}
+
+void Slice::shared_buffer_slice(
+    const array& in,
+    const std::vector<size_t>& out_strides,
+    size_t data_offset,
+    array& out) {
+  // Derive the view's data size and row/col contiguity from its layout.
+  auto [view_size, row_major, col_major] =
+      check_contiguity(out.shape(), out_strides);
+
+  auto flags = in.flags();
+  flags.row_contiguous = row_major;
+  flags.col_contiguous = col_major;
+
+  // Update the "no holes" flag:
+  // - a single element (broadcasted scalar) is always contiguous;
+  // - an unchanged data size means only a broadcasted dimension was sliced,
+  //   so the flag is left as it was;
+  // - otherwise something was sliced away, and the view stays hole-free only
+  //   if it is still row or col contiguous.
+  if (view_size == 1) {
+    flags.contiguous = true;
+  } else if (view_size != in.data_size()) {
+    flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
+  }
+
+  out.copy_shared_buffer(in, out_strides, flags, view_size, data_offset);
+}
+
 void Slice::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  if (out.size() == 0) {
@@ -488,8 +716,7 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];

  // Calculate out strides, initial offset and if copy needs to be made
-  auto [copy_needed, data_offset, inp_strides] =
-      prepare_slice(in, start_indices_, strides_);
+  auto [copy_needed, data_offset, inp_strides] = prepare_slice(in);

  // Do copy if needed
  if (copy_needed) {
@@ -510,6 +737,18 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
  }
 }

+std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
+    const array& in) {
+  // Flat start offset and step-scaled strides of the slice within `in`.
+  int64_t offset = 0;
+  std::vector<int64_t> stepped(in.ndim(), 0);
+  for (int ax = 0; ax < in.ndim(); ++ax) {
+    offset += start_indices_[ax] * in.strides()[ax];
+    stepped[ax] = in.strides()[ax] * strides_[ax];
+  }
+  return std::make_tuple(offset, stepped);
+}
+
 void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  if (out.size() == 0) {
@@ -547,6 +786,58 @@ void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
      /* CopyType ctype = */ CopyType::GeneralGeneral);
 }

+void Split::eval(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  assert(inputs.size() == 1);
+  // Each output is set up with copy_shared_buffer on the input at its offset.
+  auto& in = inputs[0];
+  // Recomputes the flags and data size for a piece with this shape/strides.
+  auto compute_new_flags = [](const auto& shape,
+                              const auto& strides,
+                              size_t in_data_size,
+                              auto flags) {
+    size_t data_size = 1;
+    size_t f_stride = 1;
+    size_t b_stride = 1;
+    flags.row_contiguous = true;
+    flags.col_contiguous = true;
+    for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
+      flags.col_contiguous &= strides[i] == f_stride || shape[i] == 1;
+      flags.row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
+      f_stride *= shape[i];
+      b_stride *= shape[ri];
+      if (strides[i] > 0) {
+        data_size *= shape[i];
+      }
+    }
+
+    if (data_size == 1) {
+      // Broadcasted scalar array is contiguous.
+      flags.contiguous = true;
+    } else if (data_size == in_data_size) {
+      // Means we sliced a broadcasted dimension so leave the "no holes" flag
+      // alone.
+    } else {
+      // We sliced something. So either we are row or col contiguous or we
+      // punched a hole.
+      flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
+    }
+
+    return std::pair<decltype(flags), size_t>{flags, data_size};
+  };
+  // Piece i starts at indices[i] along axis_; the first piece starts at 0.
+  std::vector<int> indices(1, 0);
+  indices.insert(indices.end(), indices_.begin(), indices_.end());
+  for (int i = 0; i < indices.size(); i++) {
+    size_t offset = indices[i] * in.strides()[axis_];
+    auto [new_flags, data_size] = compute_new_flags(
+        outputs[i].shape(), in.strides(), in.data_size(), in.flags());
+    outputs[i].copy_shared_buffer(
+        in, in.strides(), new_flags, data_size, offset);
+  }
+}
+
 void Square::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
@@ -563,6 +854,11 @@ void Sqrt::eval(const std::vector<array>& inputs, array& out) {
  }
 }

+void StopGradient::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1); // exactly one operand
+  out.copy_shared_buffer(inputs.front()); // hand the input through unchanged
+}
+
 void Tan::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
@@ -587,36 +883,38 @@ void Tanh::eval(const std::vector<array>& inputs, array& out) {
  }
 }

-void View::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Transpose::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
+  std::vector<size_t> out_strides(out.ndim());
  auto& in = inputs[0];
-  auto ibytes = size_of(in.dtype());
-  auto obytes = size_of(out.dtype());
-  // Conditions for buffer copying (disjunction):
-  // - type size is the same
-  // - type size is smaller and the last axis is contiguous
-  // - the entire array is row contiguous
-  if (ibytes == obytes || obytes < ibytes && in.strides().back() == 1 ||
-      in.flags().row_contiguous) {
-    auto strides = in.strides();
-    for (int i = 0; i < strides.size() - 1; ++i) {
-      strides[i] *= ibytes;
-      strides[i] /= obytes;
-    }
-    out.copy_shared_buffer(
-        in, strides, in.flags(), in.data_size() * obytes / ibytes);
-  } else {
-    auto tmp = array(in.shape(), in.dtype(), nullptr, {});
-    tmp.set_data(allocator::malloc_or_wait(tmp.nbytes()));
-    copy_inplace(in, tmp, CopyType::General);
-
-    auto flags = out.flags();
-    flags.contiguous = true;
-    flags.row_contiguous = true;
-    auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
-    flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
-    out.move_shared_buffer(tmp, out.strides(), flags, out.size());
+  for (int ax = 0; ax < axes_.size(); ++ax) {
+    out_strides[ax] = in.strides()[axes_[ax]];
  }
+
+  // Conditions for {row/col}_contiguous
+  // - array must be contiguous (no gaps)
+  // - underlying buffer size should have the same size as the array
+  // - cumulative product of shapes is equal to the strides (we can ignore axes
+  //   with size == 1)
+  //   - in the forward direction (column contiguous)
+  //   - in the reverse direction (row contiguous)
+  // - vectors are both row and col contiguous (hence if both row/col are
+  //   true, they stay true)
+  auto flags = in.flags();
+  if (flags.contiguous && in.data_size() == in.size()) {
+    size_t f_stride = 1;
+    size_t b_stride = 1;
+    flags.col_contiguous = true;
+    flags.row_contiguous = true;
+    for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
+      flags.col_contiguous &= (out_strides[i] == f_stride || out.shape(i) == 1);
+      f_stride *= out.shape(i);
+      flags.row_contiguous &=
+          (out_strides[ri] == b_stride || out.shape(ri) == 1);
+      b_stride *= out.shape(ri);
+    }
+  }
+  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
 }

 } // namespace mlx::core
--- a/mlx/backend/common/quantized.cpp
+++ b/mlx/backend/common/quantized.cpp
@@ -192,7 +192,7 @@ void _qmm_dispatch_typed(
 }

 void _qmm_dispatch(
-    array& out,
+    array out,
    const array& x,
    const array& w,
    const array& scales,
@@ -253,81 +253,6 @@ void _qmm_dispatch(
  }
 }

-void _bs_qmm_dispatch(
-    array& out,
-    const array& x,
-    const array& w,
-    const array& scales,
-    const array& biases,
-    const array& lhs_indices,
-    const array& rhs_indices,
-    int bits,
-    int group_size,
-    bool transposed_w) {
-  int K = x.shape(-1);
-  int M = x.shape(-2);
-  int N = out.shape(-1);
-
-  int w_els = w.shape(-1) * w.shape(-2);
-  int g_els = scales.shape(-1) * scales.shape(-2);
-
-  const uint32_t* lhs_indices_data = lhs_indices.data<uint32_t>();
-  const uint32_t* rhs_indices_data = rhs_indices.data<uint32_t>();
-
-  for (int i = 0; i < lhs_indices.size(); i++) {
-    int x_idx = lhs_indices_data[elem_to_loc(i, lhs_indices)];
-    int w_idx = rhs_indices_data[elem_to_loc(i, rhs_indices)];
-
-    switch (x.dtype()) {
-      case float32:
-        _qmm_dispatch_typed<float>(
-            out.data<float>() + i * M * N,
-            x.data<float>() + elem_to_loc(x_idx * M * K, x),
-            w.data<uint32_t>() + elem_to_loc(w_idx * w_els, w),
-            scales.data<float>() + elem_to_loc(w_idx * g_els, scales),
-            biases.data<float>() + elem_to_loc(w_idx * g_els, biases),
-            M,
-            N,
-            K,
-            bits,
-            group_size,
-            transposed_w);
-        break;
-      case float16:
-        _qmm_dispatch_typed<float16_t>(
-            out.data<float16_t>() + i * M * N,
-            x.data<float16_t>() + elem_to_loc(x_idx * M * K, x),
-            w.data<uint32_t>() + elem_to_loc(w_idx * w_els, w),
-            scales.data<float16_t>() + elem_to_loc(w_idx * g_els, scales),
-            biases.data<float16_t>() + elem_to_loc(w_idx * g_els, biases),
-            M,
-            N,
-            K,
-            bits,
-            group_size,
-            transposed_w);
-        break;
-      case bfloat16:
-        _qmm_dispatch_typed<bfloat16_t>(
-            out.data<bfloat16_t>() + i * M * N,
-            x.data<bfloat16_t>() + elem_to_loc(x_idx * M * K, x),
-            w.data<uint32_t>() + elem_to_loc(w_idx * w_els, w),
-            scales.data<bfloat16_t>() + elem_to_loc(w_idx * g_els, scales),
-            biases.data<bfloat16_t>() + elem_to_loc(w_idx * g_els, biases),
-            M,
-            N,
-            K,
-            bits,
-            group_size,
-            transposed_w);
-        break;
-      default:
-        throw std::invalid_argument(
-            "[quantized_matmul] only floating types are supported");
-    }
-  }
-}
-
 } // namespace

 void QuantizedMatmul::eval(const std::vector<array>& inputs, array& out) {
@@ -357,45 +282,4 @@ void QuantizedMatmul::eval(const std::vector<array>& inputs, array& out) {
  _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
 }

-void GatherQMM::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 6);
-
-  auto& x_pre = inputs[0];
-  auto& w_pre = inputs[1];
-  auto& scales_pre = inputs[2];
-  auto& biases_pre = inputs[3];
-  auto& lhs_indices = inputs[4];
-  auto& rhs_indices = inputs[5];
-
-  auto ensure_row_contiguous_last_dims = [](const array& arr) {
-    auto stride_0 = arr.strides()[arr.ndim() - 2];
-    auto stride_1 = arr.strides()[arr.ndim() - 1];
-    if (stride_0 == arr.shape(-1) && stride_1 == 1) {
-      return arr;
-    } else {
-      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-      copy(arr, arr_copy, CopyType::General);
-      return arr_copy;
-    }
-  };
-
-  auto x = ensure_row_contiguous_last_dims(x_pre);
-  auto w = ensure_row_contiguous_last_dims(w_pre);
-  auto scales = ensure_row_contiguous_last_dims(scales_pre);
-  auto biases = ensure_row_contiguous_last_dims(biases_pre);
-
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-  _bs_qmm_dispatch(
-      out,
-      x,
-      w,
-      scales,
-      biases,
-      lhs_indices,
-      rhs_indices,
-      group_size_,
-      bits_,
-      transpose_);
-}
-
 } // namespace mlx::core
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -87,38 +87,6 @@ struct OrReduce {
  }
 };

-struct MaxReduce {
-  template <typename T>
-  std::enable_if_t<std::is_integral_v<T>> operator()(T* y, T x) {
-    (*y) = (*y > x) ? *y : x;
-  };
-
-  template <typename T>
-  std::enable_if_t<!std::is_integral_v<T>> operator()(T* y, T x) {
-    if (std::isnan(x)) {
-      *y = x;
-    } else {
-      (*y) = (*y > x) ? *y : x;
-    }
-  };
-};
-
-struct MinReduce {
-  template <typename T>
-  std::enable_if_t<std::is_integral_v<T>> operator()(T* y, T x) {
-    (*y) = (*y < x) ? *y : x;
-  };
-
-  template <typename T>
-  std::enable_if_t<!std::is_integral_v<T>> operator()(T* y, T x) {
-    if (std::isnan(x)) {
-      *y = x;
-    } else {
-      (*y) = (*y < x) ? *y : x;
-    }
-  };
-};
-
 template <typename InT>
 void reduce_dispatch_out(
    const array& in,
@@ -136,27 +104,63 @@ void reduce_dispatch_out(
    }
    case Reduce::Sum: {
      auto op = [](auto y, auto x) { (*y) = (*y) + x; };
-      if (out.dtype() == int32) {
-        // special case since the input type can be bool
-        reduction_op<InT, int32_t>(in, out, axes, 0, op);
-      } else {
-        reduction_op<InT, InT>(in, out, axes, 0, op);
+      switch (out.dtype()) {
+        case bool_:
+          reduction_op<InT, bool>(in, out, axes, false, op);
+          break;
+        case uint8:
+          reduction_op<InT, uint8_t>(in, out, axes, 0, op);
+          break;
+        case uint16:
+          reduction_op<InT, uint16_t>(in, out, axes, 0, op);
+          break;
+        case uint32:
+          reduction_op<InT, uint32_t>(in, out, axes, 0, op);
+          break;
+        case uint64:
+          reduction_op<InT, uint64_t>(in, out, axes, 0, op);
+          break;
+        case int8:
+          reduction_op<InT, int8_t>(in, out, axes, 0, op);
+          break;
+        case int16:
+          reduction_op<InT, int16_t>(in, out, axes, 0, op);
+          break;
+        case int32:
+          reduction_op<InT, int32_t>(in, out, axes, 0, op);
+          break;
+        case int64:
+          reduction_op<InT, int64_t>(in, out, axes, 0, op);
+          break;
+        case float16:
+          reduction_op<InT, float16_t>(in, out, axes, 0.0f, op);
+          break;
+        case float32:
+          reduction_op<InT, float>(in, out, axes, 0.0f, op);
+          break;
+        case bfloat16:
+          reduction_op<InT, bfloat16_t>(in, out, axes, 0.0f, op);
+          break;
+        case complex64:
+          reduction_op<InT, complex64_t>(in, out, axes, complex64_t{0.0f}, op);
+          break;
      }
-      break;
-    }
+    } break;
    case Reduce::Prod: {
      auto op = [](auto y, auto x) { (*y) *= x; };
      reduction_op<InT, InT>(in, out, axes, 1, op);
      break;
    }
    case Reduce::Max: {
+      auto op = [](auto y, auto x) { (*y) = (*y > x) ? *y : x; };
      auto init = Limits<InT>::min;
-      reduction_op<InT, InT>(in, out, axes, init, MaxReduce());
+      reduction_op<InT, InT>(in, out, axes, init, op);
      break;
    }
    case Reduce::Min: {
+      auto op = [](auto y, auto x) { (*y) = (*y < x) ? *y : x; };
      auto init = Limits<InT>::max;
-      reduction_op<InT, InT>(in, out, axes, init, MinReduce());
+      reduction_op<InT, InT>(in, out, axes, init, op);
      break;
    }
  }
@@ -164,29 +168,6 @@ void reduce_dispatch_out(

 } // namespace

-void nd_loop(
-    std::function<void(int)> callback,
-    const std::vector<int>& shape,
-    const std::vector<size_t>& strides) {
-  std::function<void(int, int)> loop_inner;
-  loop_inner = [&](int dim, int offset) {
-    if (dim < shape.size() - 1) {
-      int size = shape[dim];
-      size_t stride = strides[dim];
-      for (int i = 0; i < size; i++) {
-        loop_inner(dim + 1, offset + i * stride);
-      }
-    } else {
-      int size = shape[dim];
-      size_t stride = strides[dim];
-      for (int i = 0; i < size; i++) {
-        callback(offset + i * stride);
-      }
-    }
-  };
-  loop_inner(0, 0);
-}
-
 void Reduce::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
--- a/mlx/backend/common/reduce.h
+++ b/mlx/backend/common/reduce.h
@@ -49,18 +49,47 @@ struct ReductionPlan {
  ReductionPlan(ReductionOpType type_) : type(type_) {}
 };

-ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);
+namespace {

 // Helper for the ndimensional strided loop
 // Should this be in utils?
-void nd_loop(
+inline void nd_loop(
    std::function<void(int)> callback,
    const std::vector<int>& shape,
-    const std::vector<size_t>& strides);
+    const std::vector<size_t>& strides) {
+  std::function<void(int, int)> loop_inner;
+  loop_inner = [&](int dim, int offset) {
+    if (dim < shape.size() - 1) {
+      int size = shape[dim];
+      size_t stride = strides[dim];
+      for (int i = 0; i < size; i++) {
+        loop_inner(dim + 1, offset + i * stride);
+      }
+    } else {
+      int size = shape[dim];
+      size_t stride = strides[dim];
+      for (int i = 0; i < size; i++) {
+        callback(offset + i * stride);
+      }
+    }
+  };
+  loop_inner(0, 0);
+}

 std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
    const array& x,
-    const std::vector<int>& axes);
+    const std::vector<int>& axes) {
+  std::vector<int> shape = x.shape();
+  std::vector<size_t> strides = x.strides();
+
+  for (int i = axes.size() - 1; i >= 0; i--) {
+    int a = axes[i];
+    shape.erase(shape.begin() + a);
+    strides.erase(strides.begin() + a);
+  }
+
+  return std::make_pair(shape, strides);
+}

 template <typename T, typename U, typename Op>
 struct DefaultStridedReduce {
@@ -94,6 +123,102 @@ struct DefaultContiguousReduce {
  }
 };

+// Chooses how to reduce `x` over `axes`. Returns the strategy plus, except for
+// the all-reduce shortcut, the merged shape and strides of the reduced axes.
+// `axes` is taken by const reference to avoid copying it on every call. It is
+// indexed without bounds checks (axes[0], strides.back()), so it must be
+// non-empty whenever the all-reduce shortcut does not apply.
+ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
+  // The data is all there and we are reducing over everything
+  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
+      x.flags().contiguous) {
+    return ContiguousAllReduce;
+  }
+
+  // Row contiguous input so the output is row contiguous
+  if (x.flags().row_contiguous) {
+    // Merge consecutive axes
+    std::vector<int> shape = {x.shape(axes[0])};
+    std::vector<size_t> strides = {x.strides()[axes[0]]};
+    for (int i = 1; i < axes.size(); i++) {
+      if (axes[i] - 1 == axes[i - 1]) {
+        shape.back() *= x.shape(axes[i]);
+        strides.back() = x.strides()[axes[i]];
+      } else {
+        shape.push_back(x.shape(axes[i]));
+        strides.push_back(x.strides()[axes[i]]);
+      }
+    }
+
+    if (strides.back() == 1) {
+      return ReductionPlan(ContiguousReduce, shape, strides);
+    } else if (strides.back() > 1) {
+      return ReductionPlan(ContiguousStridedReduce, shape, strides);
+    }
+  }
+
+  // Otherwise pick the best access pattern for the merged reduction axes:
+  // 1. A reduction axis has stride 1: use GeneralContiguousReduce.
+  // 2. No reduction axis has stride 1, but everything to the right of the
+  //    reduced axis is contiguous in memory: use GeneralStridedReduce.
+  // 3. Weird transpositions and expands: fall back to GeneralReduce.
+
+  // Sort reduction axes by decreasing stride in order to merge them and
+  // figure out if we have a contiguous reduction.
+  std::vector<std::pair<int, size_t>> reductions;
+  for (auto a : axes) {
+    reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
+  }
+  std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
+    return a.second > b.second;
+  });
+  // Extract the two smallest and try to merge them in case the contiguous
+  // reduction can be bigger than just the last axis.
+  for (int i = reductions.size() - 1; i >= 1; i--) {
+    auto a = reductions[i];
+    auto b = reductions[i - 1];
+
+    // b.stride = a.shape * a.stride then a and b are contiguous
+    if (b.second == a.first * a.second) {
+      reductions.erase(reductions.begin() + i);
+      reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
+    }
+  }
+
+  std::vector<int> shape;
+  std::vector<size_t> strides;
+  for (auto r : reductions) {
+    shape.push_back(r.first);
+    strides.push_back(r.second);
+  }
+
+  // We can call the contiguous reduction op for every weird way the input is
+  // structured in the rest of the axes.
+  if (strides.back() == 1) {
+    return ReductionPlan(GeneralContiguousReduce, shape, strides);
+  }
+
+  // Delegate to the general strided reduction op if the axes after
+  // strides.back() are contiguous.
+  if (strides.back() > 1) {
+    int size = 1;
+    for (int i = x.ndim() - 1; i >= 0; i--) {
+      if (axes.back() == i) {
+        continue;
+      }
+      if (x.strides()[i] != size) {
+        break;
+      }
+      size *= x.shape(i);
+    }
+    if (size >= strides.back()) {
+      return ReductionPlan(GeneralStridedReduce, shape, strides);
+    }
+  }
+
+  return ReductionPlan(GeneralReduce, shape, strides);
+}
+
 template <typename T, typename U, typename OpS, typename OpC, typename Op>
 void reduction_op(
    const array& x,
@@ -236,4 +361,6 @@ void reduction_op(
  reduction_op<T, U>(x, out, axes, init, ops, opc, op);
 }

+} // namespace
+
 } // namespace mlx::core
--- a/mlx/backend/common/reduce_utils.cpp
+++ b/mlx/backend/common/reduce_utils.cpp
@@ -1,147 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include "mlx/backend/common/reduce.h"
-
-namespace mlx::core {
-
-std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
-    const array& x,
-    const std::vector<int>& axes) {
-  std::vector<int> shape = x.shape();
-  std::vector<size_t> strides = x.strides();
-
-  for (int i = axes.size() - 1; i >= 0; i--) {
-    int a = axes[i];
-    shape.erase(shape.begin() + a);
-    strides.erase(strides.begin() + a);
-  }
-
-  return std::make_pair(shape, strides);
-}
-
-ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
-  // The data is all there and we are reducing over everything
-  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
-      x.flags().contiguous) {
-    return ContiguousAllReduce;
-  }
-
-  // Row contiguous input so the output is row contiguous
-  if (x.flags().row_contiguous) {
-    // Merge consecutive axes
-    std::vector<int> shape = {x.shape(axes[0])};
-    std::vector<size_t> strides = {x.strides()[axes[0]]};
-    for (int i = 1; i < axes.size(); i++) {
-      if (axes[i] - 1 == axes[i - 1]) {
-        shape.back() *= x.shape(axes[i]);
-        strides.back() = x.strides()[axes[i]];
-      } else {
-        shape.push_back(x.shape(axes[i]));
-        strides.push_back(x.strides()[axes[i]]);
-      }
-    }
-
-    // Remove singleton axes from the plan
-    for (int i = shape.size() - 1; i >= 0; i--) {
-      if (shape[i] == 1) {
-        shape.erase(shape.begin() + i);
-        strides.erase(strides.begin() + i);
-      }
-    }
-
-    if (strides.back() == 1) {
-      return ReductionPlan(ContiguousReduce, shape, strides);
-    } else if (strides.back() > 1) {
-      return ReductionPlan(ContiguousStridedReduce, shape, strides);
-    }
-  }
-
-  // Let's check if we can optimize our access patterns
-  //
-  // 1. We have a reduction axis with stride 1. Simply call
-  //    GeneralContiguousReduce and be done with it.
-  // 2. We have transpositions and we are not reducing over the axis with
-  //    stride 1. However, we are reducing over an axis where everything is
-  //    contiguous in memory to the right of that axis. We can call strided
-  //    reduce and be done with it.
-  // 2. We have weird transpositions and expands. Copy the strides to the
-  //    output, then call strided reduce.
-
-  // Sort reduction axes by stride in order to merge them and figure out if we
-  // have a contiguous reduction.
-  std::vector<std::pair<int, size_t>> reductions;
-  for (auto a : axes) {
-    if (x.shape(a) > 1) {
-      reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
-    }
-  }
-  std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
-    bool a_is_zero = a.second == 0;
-    bool b_is_zero = b.second == 0;
-    return (a_is_zero != b_is_zero) ? a.second < b.second : a.second > b.second;
-  });
-  // Extract the two smallest and try to merge them in case the contiguous
-  // reduction can be bigger than just the last axis.
-  for (int i = reductions.size() - 1; i >= 1; i--) {
-    auto a = reductions[i];
-    auto b = reductions[i - 1];
-
-    // b.stride = a.shape * a.stride then a and b are contiguous
-    if (b.second == a.first * a.second) {
-      reductions.erase(reductions.begin() + i);
-      reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
-    }
-  }
-
-  std::vector<int> shape;
-  std::vector<size_t> strides;
-  for (auto r : reductions) {
-    shape.push_back(r.first);
-    strides.push_back(r.second);
-  }
-
-  // We can call the contiguous reduction op for every weird way the input is
-  // structured in the rest of the axes.
-  if (strides.back() == 1) {
-    return ReductionPlan(GeneralContiguousReduce, shape, strides);
-  }
-
-  // Delegate to the general strided reduction op if the axes after
-  // strides.back() are contiguous.
-  if (strides.back() > 1) {
-    int size = 1;
-    bool have_expand = false;
-    for (int i = x.ndim() - 1; i >= 0; i--) {
-      if (axes.back() == i) {
-        continue;
-      }
-
-      size_t stride_i = x.strides()[i];
-      int shape_i = x.shape(i);
-      if (stride_i == 0) {
-        if (shape_i == 1) {
-          continue;
-        }
-
-        have_expand = true;
-        break;
-      }
-
-      if (stride_i != size && shape_i != 1) {
-        break;
-      }
-      size *= shape_i;
-    }
-    // In the case of an expanded dimension we are being conservative and
-    // require the smallest reduction stride to be smaller than the maximum row
-    // contiguous size. The reason is that we can't easily know if the reduced
-    // axis is before or after an expanded dimension.
-    if (size > strides.back() || (size == strides.back() && !have_expand)) {
-      return ReductionPlan(GeneralStridedReduce, shape, strides);
-    }
-  }
-
-  return ReductionPlan(GeneralReduce, shape, strides);
-}
-
-} // namespace mlx::core
--- a/mlx/backend/common/scan.cpp
+++ b/mlx/backend/common/scan.cpp
@@ -234,7 +234,7 @@ void scan_dispatch(
      auto op = [](U* o, const U* y, const T* x) { *o = (*x < *y) ? *y : *x; };
      auto init = (issubdtype(input.dtype(), floating))
          ? static_cast<U>(-std::numeric_limits<float>::infinity())
-          : std::numeric_limits<U>::min();
+          : std::numeric_limits<U>::min(); // lowest integer: max identity
      auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
      auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
      scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);
--- a/mlx/backend/common/slicing.cpp
+++ b/mlx/backend/common/slicing.cpp
@@ -1,52 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include "mlx/backend/common/utils.h"
-
-namespace mlx::core {
-
-std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
-    const array& in,
-    std::vector<int>& start_indices,
-    std::vector<int>& strides) {
-  int64_t data_offset = 0;
-  bool copy_needed = false;
-  std::vector<int64_t> inp_strides(in.ndim(), 0);
-  for (int i = 0; i < in.ndim(); ++i) {
-    data_offset += start_indices[i] * in.strides()[i];
-    inp_strides[i] = in.strides()[i] * strides[i];
-
-    copy_needed |= strides[i] < 0;
-  }
-
-  return std::make_tuple(copy_needed, data_offset, inp_strides);
-}
-
-void shared_buffer_slice(
-    const array& in,
-    const std::vector<size_t>& out_strides,
-    size_t data_offset,
-    array& out) {
-  // Compute row/col contiguity
-  auto [data_size, is_row_contiguous, is_col_contiguous] =
-      check_contiguity(out.shape(), out_strides);
-
-  auto flags = in.flags();
-  flags.row_contiguous = is_row_contiguous;
-  flags.col_contiguous = is_col_contiguous;
-
-  if (data_size == 1) {
-    // Broadcasted scalar array is contiguous.
-    flags.contiguous = true;
-  } else if (data_size == in.data_size()) {
-    // Means we sliced a broadcasted dimension so leave the "no holes" flag
-    // alone.
-  } else {
-    // We sliced something. So either we are row or col contiguous or we
-    // punched a hole.
-    flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
-  }
-
-  out.copy_shared_buffer(in, out_strides, flags, data_size, data_offset);
-}
-
-} // namespace mlx::core
--- a/mlx/backend/common/slicing.h
+++ b/mlx/backend/common/slicing.h
@@ -1,20 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#pragma once
-
-#include "mlx/array.h"
-
-namespace mlx::core {
-
-std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
-    const array& in,
-    std::vector<int>& start_indices,
-    std::vector<int>& strides);
-
-void shared_buffer_slice(
-    const array& in,
-    const std::vector<size_t>& out_strides,
-    size_t data_offset,
-    array& out);
-
-} // namespace mlx::core
--- a/mlx/backend/common/sort.cpp
+++ b/mlx/backend/common/sort.cpp
@@ -113,14 +113,14 @@ void sort(const array& in, array& out, int axis) {
  axis = axis < 0 ? axis + in.ndim() : axis;
  size_t n_rows = in.size() / in.shape(axis);

-  auto remaining_shape = out.shape();
+  auto remaining_shape = in.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);

-  auto remaining_strides = out.strides();
+  auto remaining_strides = in.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

-  size_t axis_stride = out.strides()[axis];
-  int axis_size = out.shape(axis);
+  size_t axis_stride = in.strides()[axis];
+  int axis_size = in.shape(axis);

  // Perform sorting in place
  for (int i = 0; i < n_rows; i++) {
@@ -143,42 +143,34 @@ void argsort(const array& in, array& out, int axis) {
  axis = axis < 0 ? axis + in.ndim() : axis;
  size_t n_rows = in.size() / in.shape(axis);

-  auto in_remaining_shape = in.shape();
-  in_remaining_shape.erase(in_remaining_shape.begin() + axis);
+  auto remaining_shape = in.shape();
+  remaining_shape.erase(remaining_shape.begin() + axis);

-  auto in_remaining_strides = in.strides();
-  in_remaining_strides.erase(in_remaining_strides.begin() + axis);
+  auto remaining_strides = in.strides();
+  remaining_strides.erase(remaining_strides.begin() + axis);

-  auto out_remaining_shape = out.shape();
-  out_remaining_shape.erase(out_remaining_shape.begin() + axis);
-
-  auto out_remaining_strides = out.strides();
-  out_remaining_strides.erase(out_remaining_strides.begin() + axis);
-
-  size_t in_stride = in.strides()[axis];
-  size_t out_stride = out.strides()[axis];
+  size_t axis_stride = in.strides()[axis];
  int axis_size = in.shape(axis);

  // Perform sorting
  for (int i = 0; i < n_rows; i++) {
-    size_t in_loc = elem_to_loc(i, in_remaining_shape, in_remaining_strides);
-    size_t out_loc = elem_to_loc(i, out_remaining_shape, out_remaining_strides);
-    const T* data_ptr = in.data<T>() + in_loc;
-    IdxT* idx_ptr = out.data<IdxT>() + out_loc;
+    size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
+    const T* data_ptr = in.data<T>() + loc;
+    IdxT* idx_ptr = out.data<IdxT>() + loc;

-    StridedIterator st_(idx_ptr, out_stride, 0);
-    StridedIterator ed_(idx_ptr, out_stride, axis_size);
+    StridedIterator st_(idx_ptr, axis_stride, 0);
+    StridedIterator ed_(idx_ptr, axis_stride, axis_size);

    // Initialize with iota
    std::iota(st_, ed_, IdxT(0));

    // Sort according to vals
-    StridedIterator st(idx_ptr, out_stride, 0);
-    StridedIterator ed(idx_ptr, out_stride, axis_size);
+    StridedIterator st(idx_ptr, axis_stride, 0);
+    StridedIterator ed(idx_ptr, axis_stride, axis_size);

-    std::stable_sort(st, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
-      auto v1 = data_ptr[a * in_stride];
-      auto v2 = data_ptr[b * in_stride];
+    std::stable_sort(st, ed, [data_ptr, axis_stride](IdxT a, IdxT b) {
+      auto v1 = data_ptr[a * axis_stride];
+      auto v2 = data_ptr[b * axis_stride];
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
--- a/mlx/backend/common/svd.cpp
+++ b/mlx/backend/common/svd.cpp
@@ -3,6 +3,7 @@
 #include "mlx/allocator.h"
 #include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/lapack_helper.h"
+#include "mlx/linalg.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
@@ -144,4 +145,12 @@ void SVD::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
  svd_impl(inputs[0], outputs[0], outputs[1], outputs[2]);
 }

+std::pair<std::vector<array>, std::vector<int>> SVD::vmap(
+    const std::vector<array>& inputs,
+    const std::vector<int>& axes) {
+  auto ax = axes[0] >= 0 ? 0 : -1; // axis reported for all three outputs
+  auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
+  return {{linalg::svd(a, stream())}, {ax, ax, ax}};
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -29,15 +29,6 @@ inline size_t elem_to_loc(int elem, const array& a) {
  return elem_to_loc(elem, a.shape(), a.strides());
 }

-template <typename stride_t>
-std::vector<stride_t> make_contiguous_strides(const std::vector<int>& shape) {
-  std::vector<stride_t> strides(shape.size(), 1);
-  for (int i = shape.size() - 1; i > 0; i--) {
-    strides[i - 1] = strides[i] * shape[i];
-  }
-  return strides;
-}
-
 // Collapse dims that are contiguous to possibly route to a better kernel
 // e.g. for x = transpose(array({0, 1, 2, 3, 4, 5, 6, 7}, {2, 2, 2}), {2, 0, 1})
 // should return {{2, 4}, {{1, 2}}}.
@@ -104,33 +95,6 @@ inline auto collapse_contiguous_dims(Arrays&&... xs) {
      std::vector<array>{std::forward<Arrays>(xs)...});
 }

-// The single array version of the above.
-inline std::tuple<std::vector<int>, std::vector<size_t>>
-collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<size_t>& strides) {
-  std::vector<int> collapsed_shape;
-  std::vector<size_t> collapsed_strides;
-
-  if (shape.size() > 0) {
-    collapsed_shape.push_back(shape[0]);
-    collapsed_strides.push_back(strides[0]);
-    for (int i = 1; i < shape.size(); i++) {
-      if (strides[i] * shape[i] != collapsed_strides.back() ||
-          collapsed_shape.back() * static_cast<size_t>(shape[i]) >
-              std::numeric_limits<int>::max()) {
-        collapsed_shape.push_back(shape[i]);
-        collapsed_strides.push_back(strides[i]);
-      } else {
-        collapsed_shape.back() *= shape[i];
-        collapsed_strides.back() = strides[i];
-      }
-    }
-  }
-
-  return std::make_tuple(collapsed_shape, collapsed_strides);
-}
-
 template <typename stride_t>
 inline auto check_contiguity(
    const std::vector<int>& shape,
@@ -142,8 +106,8 @@ inline auto check_contiguity(
  bool is_col_contiguous = true;

  for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
-    is_col_contiguous &= strides[i] == f_stride || shape[i] == 1;
-    is_row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
+    is_col_contiguous &= strides[i] == f_stride || shape[i] == 1;
+    is_row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
    f_stride *= shape[i];
    b_stride *= shape[ri];
    if (strides[i] > 0) {
--- a/mlx/backend/io/CMakeLists.txt
+++ b/mlx/backend/io/CMakeLists.txt
@@ -0,0 +1,7 @@
+target_sources(
+  mlx
+  PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}/io_impl.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/thread_pool.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
+)
--- a/mlx/backend/io/io_impl.cpp
+++ b/mlx/backend/io/io_impl.cpp
@@ -0,0 +1,72 @@
+// Copyright © 2024 Apple Inc.
+
+#include "mlx/backend/io/io_impl.h"
+#include "mlx/backend/io/thread_pool.h"
+#include "mlx/primitives.h"
+#include "mlx/scheduler.h"
+
+namespace mlx::core::io {
+
+namespace {
+
+// Returns the process-wide IO thread pool (4 workers), created on first use.
+// A function-local static object is used because its initialization is
+// guaranteed to run exactly once even when several threads make the first
+// call concurrently (C++11 and later); a hand-rolled null check on a static
+// pointer would be a data race.
+detail::ThreadPool& thread_pool() {
+  static detail::ThreadPool pool(4);
+  return pool;
+}
+
+} // namespace
+
+std::function<void()> make_task(array arr, bool signal) {
+  return [arr = std::move(arr), signal]() mutable {
+    auto stream = arr.primitive().stream();
+
+    // Wait for inputs whose event belongs to a different stream than ours.
+    for (auto& input : arr.inputs()) {
+      if (input.event().valid() && input.event().stream() != stream) {
+        input.event().wait();
+      }
+    }
+
+    // Tell the scheduler a task started; the enqueued task reports completion.
+    scheduler::notify_new_task(stream);
+
+    // Copy inputs/outputs before `arr` is moved into the pool task below.
+    auto inputs = arr.inputs();
+    auto outputs = arr.outputs();
+    thread_pool().enqueue(
+        [arr = std::move(arr), inputs, outputs, signal, stream]() mutable {
+          // Run the primitive's IO evaluation on the captured copies.
+          arr.primitive().eval_io(inputs, outputs);
+
+          if (!arr.is_tracer()) {
+            arr.detach();
+          }
+
+          if (signal) {
+            thread_pool().barrier(
+                [arr = std::move(arr)]() { arr.event().signal(); });
+          }
+
+          // Report completion for the notify_new_task call made above.
+          scheduler::notify_task_completion(stream);
+        },
+        inputs,
+        outputs);
+  };
+}
+
+std::function<void()> make_synchronize_task(
+    Stream s, // not used by the returned task
+    std::shared_ptr<std::promise<void>> p) {
+  return [p = std::move(p)]() {
+    thread_pool().barrier().wait(); // wait on the pool-wide barrier first
+    p->set_value(); // then fulfil the caller's promise
+  };
+}
+
+} // namespace mlx::core::io
--- a/mlx/backend/io/io_impl.h
+++ b/mlx/backend/io/io_impl.h
@@ -0,0 +1,18 @@
+// Copyright © 2024 Apple Inc.
+
+#pragma once
+
+#include <functional>
+#include <future>
+#include <memory>
+
+#include "mlx/array.h"
+
+namespace mlx::core::io {
+
+std::function<void()> make_task(array arr, bool signal);
+std::function<void()> make_synchronize_task(
+    Stream s,
+    std::shared_ptr<std::promise<void>> p);
+
+} // namespace mlx::core::io
--- a/mlx/backend/io/primitives.cpp
+++ b/mlx/backend/io/primitives.cpp
@@ -0,0 +1,60 @@
+// Copyright © 2024 Apple Inc.
+
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
+#include "mlx/allocator.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+// Reverses the byte order of N consecutive scalar_size-byte elements in place.
+template <const uint8_t scalar_size>
+void swap_endianness(uint8_t* data_bytes, size_t N) {
+  // Walk the buffer one element at a time; `first` and `last` delimit the
+  // bytes of the current element.
+  uint8_t* first = data_bytes;
+  for (size_t remaining = N; remaining > 0; --remaining) {
+    uint8_t* last = first + scalar_size;
+    // Mirroring the bytes of a scalar converts it between its little and big
+    // endian representations.
+    std::reverse(first, last);
+    first = last;
+  }
+}
+
+} // namespace
+
+void Load::eval_io(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  assert(inputs.size() == 0); // a load takes no array inputs
+  array& out = outputs[0];
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+
+  {
+    std::lock_guard lock(*reader_);
+    // Seek and read while holding the reader's lock.
+    reader_->seek(offset_, std::ios_base::beg);
+    reader_->read(out.data<char>(), out.nbytes()); // result not checked here
+  }
+
+  if (swap_endianness_) { // only item sizes 2, 4 and 8 are byte-swapped
+    switch (out.itemsize()) {
+      case 2:
+        swap_endianness<2>(out.data<uint8_t>(), out.data_size());
+        break;
+      case 4:
+        swap_endianness<4>(out.data<uint8_t>(), out.data_size());
+        break;
+      case 8:
+        swap_endianness<8>(out.data<uint8_t>(), out.data_size());
+        break;
+    }
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/io/thread_pool.cpp
+++ b/mlx/backend/io/thread_pool.cpp
@@ -0,0 +1,216 @@
+// Copyright © 2024 Apple Inc.
+
+#include <numeric>
+
+#include "mlx/backend/io/thread_pool.h"
+
+namespace mlx::core::io::detail {
+
+// Creates one task queue, queue mutex, condition variable, set mutex and
+// output set per worker, then starts `workers` threads, each running
+// worker(i) with its own index.
+ThreadPool::ThreadPool(int workers)
+    : task_queues_(workers),
+      queue_mutexes_(workers),
+      queue_cvs_(workers),
+      set_mutexes_(workers),
+      output_sets_(workers),
+      stop_(false) {
+  int next_idx = 0;
+  while (next_idx < workers) {
+    workers_.emplace_back(&ThreadPool::worker, this, next_idx);
+    ++next_idx;
+  }
+}
+
+// Asks every worker to stop and joins its thread. A worker only exits once
+// its queue is empty, so tasks that are still queued are run first.
+ThreadPool::~ThreadPool() {
+  {
+    // Each worker reads stop_ under its own queue mutex as part of its
+    // condition-variable predicate. Raise the flag while holding all of those
+    // mutexes so the write cannot race with a worker's read, and so no worker
+    // can evaluate the predicate, miss the notification below and then block
+    // forever (which would hang the join).
+    std::vector<std::unique_lock<std::mutex>> locks;
+    locks.reserve(queue_mutexes_.size());
+    for (auto& m : queue_mutexes_) {
+      locks.emplace_back(m);
+    }
+    stop_ = true;
+  }
+  for (auto& cv : queue_cvs_) {
+    cv.notify_one();
+  }
+
+  for (auto& t : workers_) {
+    if (t.joinable()) {
+      t.join();
+    }
+  }
+}
+
+// Schedules `task` on one worker queue so that it runs after every queued
+// task that produces one of `inputs`, and returns a future for its completion.
+//
+// While the task is pending, the ids of `outputs` are recorded in the chosen
+// worker's output set so that later calls can find the queue producing them.
+std::future<void> ThreadPool::enqueue(
+    std::function<void()> task,
+    const std::vector<array>& inputs,
+    const std::vector<array>& outputs) {
+  // Collect every worker whose pending outputs include one of our inputs.
+  std::vector<int> barriers;
+  if (!inputs.empty()) {
+    const int num_workers = static_cast<int>(output_sets_.size());
+    for (int i = 0; i < num_workers; i++) {
+      std::lock_guard<std::mutex> lock(set_mutexes_[i]);
+
+      for (auto& a : inputs) {
+        if (output_sets_[i].find(a.id()) != output_sets_[i].end()) {
+          barriers.push_back(i);
+          break;
+        }
+      }
+    }
+  }
+
+  // Returns an iterator to the candidate worker with the fewest queued tasks
+  // (the first one on ties). Workers pop from their queues concurrently, so
+  // each size is read under that queue's mutex.
+  auto shortest_queue = [this](const std::vector<int>& candidates) {
+    auto best = candidates.begin();
+    size_t best_size = 0;
+    for (auto it = candidates.begin(); it != candidates.end(); ++it) {
+      size_t size;
+      {
+        std::lock_guard<std::mutex> lock(queue_mutexes_[*it]);
+        size = task_queues_[*it].size();
+      }
+      if (it == candidates.begin() || size < best_size) {
+        best = it;
+        best_size = size;
+      }
+    }
+    return best;
+  };
+
+  int worker_idx;
+  if (barriers.empty()) {
+    // Case 1: No queue produces our inputs, so use the shortest queue overall.
+    std::vector<int> all_workers(task_queues_.size());
+    std::iota(all_workers.begin(), all_workers.end(), 0);
+    worker_idx = *shortest_queue(all_workers);
+  } else if (barriers.size() == 1) {
+    // Case 2: Exactly one queue produces our inputs. Queues are FIFO, so
+    // appending to that queue already orders us after the producers.
+    worker_idx = barriers[0];
+  } else {
+    // Case 3: Several queues produce our inputs. Run on the shortest of them
+    // and wait, before the task itself, for a barrier over the others.
+    auto best = shortest_queue(barriers);
+    worker_idx = *best;
+    barriers.erase(best);
+    // A shared_future is used because the wrapper below is stored in a
+    // std::function, which requires a copyable callable.
+    std::shared_future<void> queue_barrier = barrier(barriers);
+    task = [queue_barrier = std::move(queue_barrier),
+            og_task = std::move(task)]() {
+      queue_barrier.wait();
+      og_task();
+    };
+  }
+
+  add_outputs_to_worker(outputs, worker_idx);
+  return enqueue(
+      remove_outputs_when_done(std::move(task), outputs, worker_idx),
+      worker_idx);
+}
+
+// Appends `task` to the queue of worker `worker_idx`, wakes that worker, and
+// returns a future that becomes ready once the task has run.
+std::future<void> ThreadPool::enqueue(
+    std::function<void()> task,
+    int worker_idx) {
+  std::packaged_task<void()> packaged(std::move(task));
+  auto done = packaged.get_future();
+  {
+    // The queue is shared with the worker thread; push under its mutex.
+    std::lock_guard<std::mutex> guard(queue_mutexes_[worker_idx]);
+    task_queues_[worker_idx].push(std::move(packaged));
+  }
+  queue_cvs_[worker_idx].notify_one();
+  return done;
+}
+
+// Records the id of every array in `outputs` in the output set of worker
+// `worker_idx`. Returns without taking the lock when `outputs` is empty.
+void ThreadPool::add_outputs_to_worker(
+    const std::vector<array>& outputs,
+    int worker_idx) {
+  if (outputs.empty()) {
+    return;
+  }
+
+  std::lock_guard<std::mutex> guard(set_mutexes_[worker_idx]);
+  auto& pending_ids = output_sets_[worker_idx];
+  for (const auto& out : outputs) {
+    pending_ids.insert(out.id());
+  }
+}
+
+// Wraps `task` so that, after it has run, the ids of `outputs` are erased from
+// the output set of worker `worker_idx`. With no outputs the task is returned
+// unchanged.
+std::function<void()> ThreadPool::remove_outputs_when_done(
+    std::function<void()> task,
+    const std::vector<array>& outputs,
+    int worker_idx) {
+  if (outputs.empty()) {
+    return task;
+  }
+
+  // Copy the ids now: the wrapper keeps no reference to `outputs`.
+  std::vector<std::uintptr_t> finished_ids;
+  for (const auto& out : outputs) {
+    finished_ids.push_back(out.id());
+  }
+
+  return [this,
+          worker_idx,
+          inner = std::move(task),
+          finished = std::move(finished_ids)]() {
+    inner();
+    std::lock_guard<std::mutex> guard(set_mutexes_[worker_idx]);
+    auto& pending_ids = output_sets_[worker_idx];
+    for (auto id : finished) {
+      pending_ids.erase(id);
+    }
+  };
+}
+
+// Enqueues a marker task on every worker in `worker_ids`. The last marker to
+// run calls `on_barrier` and then fulfils the returned future. Queues are
+// FIFO, so the future becomes ready only after each listed worker has run
+// everything that was queued ahead of its marker.
+std::future<void> ThreadPool::barrier(
+    const std::vector<int>& worker_ids,
+    std::function<void()> on_barrier) {
+  auto promise = std::make_shared<std::promise<void>>();
+  auto future = promise->get_future();
+
+  // With no workers to wait for, the barrier is trivially reached. Finish now
+  // instead of returning a future that nothing would ever fulfil.
+  if (worker_ids.empty()) {
+    on_barrier();
+    promise->set_value();
+    return future;
+  }
+
+  auto remaining = std::make_shared<std::atomic<int>>(
+      static_cast<int>(worker_ids.size()));
+  // All markers share one copy of the callback. Moving it into the first
+  // marker would leave the others holding an empty std::function, and the
+  // marker that finishes last is not necessarily the first one enqueued.
+  auto callback =
+      std::make_shared<std::function<void()>>(std::move(on_barrier));
+
+  for (auto idx : worker_ids) {
+    enqueue(
+        [remaining, promise, callback]() {
+          // fetch_sub returns the previous value, so exactly one marker sees
+          // 1 here, even when several markers finish at the same time.
+          if (remaining->fetch_sub(1) == 1) {
+            (*callback)();
+            promise->set_value();
+          }
+        },
+        idx);
+  }
+
+  return future;
+}
+
+// Barrier over the given workers with no completion callback.
+std::future<void> ThreadPool::barrier(const std::vector<int>& worker_ids) {
+  return barrier(worker_ids, std::function<void()>([]() {}));
+}
+
+// Barrier over every worker in the pool; `on_barrier` is passed through to
+// the (worker_ids, on_barrier) overload.
+std::future<void> ThreadPool::barrier(std::function<void()> on_barrier) {
+  const int num_workers = static_cast<int>(workers_.size());
+  std::vector<int> every_worker;
+  every_worker.reserve(workers_.size());
+  for (int idx = 0; idx < num_workers; ++idx) {
+    every_worker.push_back(idx);
+  }
+  return barrier(every_worker, std::move(on_barrier));
+}
+
+// Barrier over every worker in the pool with no completion callback.
+std::future<void> ThreadPool::barrier() {
+  return barrier(std::function<void()>([]() {}));
+}
+
+// Thread body for worker `idx`: repeatedly takes the task at the front of its
+// own queue and runs it, sleeping on the queue's condition variable while the
+// queue is empty. Exits once stop_ is set and the queue has been drained.
+void ThreadPool::worker(int idx) {
+  auto& queue = task_queues_[idx];
+  auto has_work_or_stopping = [this, &queue]() {
+    return stop_ || !queue.empty();
+  };
+
+  for (;;) {
+    std::packaged_task<void()> next;
+    {
+      std::unique_lock<std::mutex> guard(queue_mutexes_[idx]);
+      queue_cvs_[idx].wait(guard, has_work_or_stopping);
+      if (queue.empty()) {
+        if (!stop_) {
+          continue;
+        }
+        break;
+      }
+      next = std::move(queue.front());
+      queue.pop();
+    }
+    // Run the task with the queue unlocked so new work can be enqueued.
+    try {
+      next();
+    } catch (...) {
+      // do nothing?
+    }
+  }
+}
+
+} // namespace mlx::core::io::detail
--- a/mlx/backend/io/thread_pool.h
+++ b/mlx/backend/io/thread_pool.h
@@ -0,0 +1,52 @@
+// Copyright © 2024 Apple Inc.
+
+#pragma once
+
+#include <future>
+#include <queue>
+#include <unordered_set>
+
+#include "mlx/array.h"
+
+namespace mlx::core::io::detail {
+
+// A fixed set of worker threads, each with its own FIFO task queue. Tasks are
+// enqueued together with the arrays they read and write; a task is placed so
+// that it runs after queued tasks that produce any of its inputs.
+class ThreadPool {
+ public:
+  // Starts `workers` threads, each serving its own queue.
+  explicit ThreadPool(int workers);
+  // Sets the stop flag, wakes the workers and joins their threads.
+  ~ThreadPool();
+
+  // The pool is neither copyable nor movable.
+  ThreadPool(ThreadPool&&) = delete;
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator=(ThreadPool&&) = delete;
+  ThreadPool& operator=(const ThreadPool&) = delete;
+
+  // Schedules `task` after any queued task whose outputs include one of
+  // `inputs`, records `outputs` as pending on the chosen worker, and returns a
+  // future for the task's completion.
+  std::future<void> enqueue(
+      std::function<void()> task,
+      const std::vector<array>& inputs,
+      const std::vector<array>& outputs);
+  // Enqueues a marker on each worker in `worker_ids`; `on_barrier` is called
+  // by a marker once the shared count of outstanding markers reaches zero,
+  // after which the returned future is fulfilled.
+  std::future<void> barrier(
+      const std::vector<int>& worker_ids,
+      std::function<void()> on_barrier);
+  // Same as above with a no-op callback.
+  std::future<void> barrier(const std::vector<int>& worker_ids);
+  // Barrier over all workers with a callback.
+  std::future<void> barrier(std::function<void()> on_barrier);
+  // Barrier over all workers with a no-op callback.
+  std::future<void> barrier();
+
+ private:
+  // Pushes `task` onto the queue of `worker_idx` and notifies that worker.
+  std::future<void> enqueue(std::function<void()> task, int worker_idx);
+  // Inserts the ids of `outputs` into output_sets_[worker_idx].
+  void add_outputs_to_worker(const std::vector<array>& outputs, int worker_idx);
+  // Wraps `task` so the ids of `outputs` are erased from
+  // output_sets_[worker_idx] after it has run.
+  std::function<void()> remove_outputs_when_done(
+      std::function<void()> task,
+      const std::vector<array>& outputs,
+      int worker_idx);
+  // Thread body: runs tasks from task_queues_[idx] until the pool stops.
+  void worker(int idx);
+
+  // All per-worker containers below are indexed by worker id.
+  // Pending tasks of each worker; task_queues_[i] is accessed under
+  // queue_mutexes_[i].
+  std::vector<std::queue<std::packaged_task<void()>>> task_queues_;
+  std::vector<std::mutex> queue_mutexes_;
+  // Notified when a task is pushed to the matching queue and on destruction.
+  std::vector<std::condition_variable> queue_cvs_;
+  // set_mutexes_[i] guards output_sets_[i].
+  std::vector<std::mutex> set_mutexes_;
+  // Ids of the arrays that tasks queued on each worker have yet to produce.
+  std::vector<std::unordered_set<std::uintptr_t>> output_sets_;
+  // Set by the destructor; a worker exits once this is set and its queue is
+  // empty.
+  bool stop_;
+  std::vector<std::thread> workers_;
+};
+
+} // namespace mlx::core::io::detail
--- a/mlx/backend/metal/CMakeLists.txt
+++ b/mlx/backend/metal/CMakeLists.txt
@@ -1,141 +1,33 @@
-function(make_jit_source SRC_FILE)
-  # This function takes a metal header file,
-  # runs the C preprocessesor on it, and makes
-  # the processed contents available as a string in a C++ function
-  # mlx::core::metal::${SRC_NAME}()
-  #
-  # To use the function, declare it in jit/includes.h and
-  # include jit/includes.h.
-  #
-  # Additional arguments to this function are treated as dependencies
-  # in the Cmake build system.
-  get_filename_component(SRC_NAME ${SRC_FILE} NAME)
-  add_custom_command(
-    OUTPUT  jit/${SRC_NAME}.cpp
+add_custom_command(
+    OUTPUT  compiled_preamble.cpp
    COMMAND /bin/bash
              ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
-              ${CMAKE_CURRENT_BINARY_DIR}/jit
+              ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
              ${CMAKE_C_COMPILER}
              ${PROJECT_SOURCE_DIR}
-              ${SRC_FILE}
-              "-DMLX_METAL_VERSION=${MLX_METAL_VERSION}"
    DEPENDS make_compiled_preamble.sh
-            kernels/${SRC_FILE}.h
-            ${ARGN}
-  )
-  add_custom_target(${SRC_NAME} DEPENDS jit/${SRC_NAME}.cpp)
-  add_dependencies(mlx ${SRC_NAME})
-  target_sources(
-    mlx
-    PRIVATE
-    ${CMAKE_CURRENT_BINARY_DIR}/jit/${SRC_NAME}.cpp
-  )
-endfunction(make_jit_source)
+            kernels/compiled_preamble.h
+            kernels/unary.h
+            kernels/binary.h
+)

-make_jit_source(
-  utils
-  kernels/bf16.h
-  kernels/complex.h
-  kernels/defines.h
+add_custom_target(
+  compiled_preamble
+  DEPENDS compiled_preamble.cpp
 )
-make_jit_source(
-  unary_ops
-  kernels/erf.h
-  kernels/expm1f.h
-)
-make_jit_source(binary_ops)
-make_jit_source(ternary_ops)
-make_jit_source(
-  reduce_utils
-  kernels/atomic.h
-  kernels/reduction/ops.h
-)
-make_jit_source(scatter)
-make_jit_source(gather)
-make_jit_source(hadamard)

-if (MLX_METAL_JIT) 
-  target_sources(
-    mlx
-    PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}/jit_kernels.cpp
-  )
-  make_jit_source(arange)
-  make_jit_source(copy)
-  make_jit_source(unary)
-  make_jit_source(binary)
-  make_jit_source(binary_two)
-  make_jit_source(
-    fft
-    kernels/fft/radix.h
-    kernels/fft/readwrite.h
-  )
-  make_jit_source(ternary)
-  make_jit_source(softmax)
-  make_jit_source(scan)
-  make_jit_source(sort)
-  make_jit_source(
-    reduce
-    kernels/reduction/reduce_all.h
-    kernels/reduction/reduce_col.h
-    kernels/reduction/reduce_row.h
-  )
-  make_jit_source(
-    steel/gemm/gemm
-    kernels/steel/utils.h
-    kernels/steel/gemm/loader.h
-    kernels/steel/gemm/mma.h
-    kernels/steel/gemm/params.h
-    kernels/steel/gemm/transforms.h
-  )
-  make_jit_source(steel/gemm/kernels/steel_gemm_fused)
-  make_jit_source(
-    steel/gemm/kernels/steel_gemm_masked
-    kernels/steel/defines.h
-  )
-  make_jit_source(steel/gemm/kernels/steel_gemm_splitk)
-  make_jit_source(
-    steel/conv/conv
-    kernels/steel/utils.h
-    kernels/steel/defines.h
-    kernels/steel/gemm/mma.h
-    kernels/steel/gemm/transforms.h
-    kernels/steel/conv/params.h
-    kernels/steel/conv/loader.h
-    kernels/steel/conv/loaders/loader_channel_l.h
-    kernels/steel/conv/loaders/loader_channel_n.h
-  )
-  make_jit_source(
-    steel/conv/kernels/steel_conv
-  )
-  make_jit_source(
-    steel/conv/kernels/steel_conv_general
-    kernels/steel/defines.h
-    kernels/steel/conv/loaders/loader_general.h
-  )
-  make_jit_source(quantized)
-  make_jit_source(gemv_masked)
-else()
-  target_sources(
-    mlx
-    PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}/nojit_kernels.cpp
-  )
-endif()
+add_dependencies(mlx compiled_preamble)

 target_sources(
  mlx
  PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/custom_kernel.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
@@ -145,13 +37,10 @@ target_sources(
  ${CMAKE_CURRENT_SOURCE_DIR}/normalization.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/rope.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
 )

 if (NOT MLX_METAL_PATH)
--- a/mlx/backend/metal/allocator.cpp
+++ b/mlx/backend/metal/allocator.cpp
@@ -242,17 +242,8 @@ void MetalAllocator::free(Buffer buffer) {
 }

 MetalAllocator& allocator() {
-  // By creating the |allocator_| on heap, the destructor of MetalAllocator will
-  // not be called on exit and all the buffers will be leaked. This is necessary
-  // because releasing buffers can take more than 30sec when the program holds a
-  // lot of RAM (for example inferencing a LLM), and it would feel frozen to
-  // users when exiting.
-  // TODO(zcbenz): Consider using the `base::NoDestructor` class from Chromium
-  // when applying this pattern to more places, or when introducing sanitizers
-  // to MLX.
-  // https://source.chromium.org/chromium/chromium/src/+/main:base/no_destructor.h
-  static MetalAllocator* allocator_ = new MetalAllocator;
-  return *allocator_;
+  static MetalAllocator allocator_;
+  return allocator_;
 }

 size_t set_cache_limit(size_t limit) {
--- a/mlx/backend/metal/binary.cpp
+++ b/mlx/backend/metal/binary.cpp
@@ -1,296 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include "mlx/backend/common/binary.h"
-#include "mlx/backend/metal/device.h"
-#include "mlx/backend/metal/kernels.h"
-#include "mlx/backend/metal/utils.h"
-#include "mlx/primitives.h"
-
-#define BINARY_GPU(func)                                              \
-  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
-    binary_op_gpu(inputs, out, get_primitive_string(this));           \
-  }
-
-#define BINARY_GPU_MULTI(func)                                         \
-  void func::eval_gpu(                                                 \
-      const std::vector<array>& inputs, std::vector<array>& outputs) { \
-    binary_op_gpu(inputs, outputs, get_primitive_string(this));        \
-  }
-
-namespace mlx::core {
-
-constexpr int MAX_BINARY_SPECIALIZED_DIMS = 5;
-
-std::string get_kernel_name(
-    BinaryOpType bopt,
-    const std::string& op,
-    const array& a,
-    bool use_2d,
-    int ndim) {
-  std::ostringstream kname;
-  switch (bopt) {
-    case BinaryOpType::ScalarScalar:
-      kname << "ss";
-      break;
-    case BinaryOpType::ScalarVector:
-      kname << (use_2d ? "sv2" : "sv");
-      break;
-    case BinaryOpType::VectorScalar:
-      kname << (use_2d ? "vs2" : "vs");
-      break;
-    case BinaryOpType::VectorVector:
-      kname << (use_2d ? "vv2" : "vv");
-      break;
-    case BinaryOpType::General:
-      kname << "g";
-      if (ndim <= MAX_BINARY_SPECIALIZED_DIMS) {
-        kname << ndim;
-      } else {
-        kname << "n";
-      }
-      break;
-  }
-  kname << op << type_to_name(a);
-  return kname.str();
-}
-
-void binary_op_gpu_inplace(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs,
-    const std::string& op,
-    const Stream& s) {
-  auto& a = inputs[0];
-  auto& b = inputs[1];
-  auto bopt = get_binary_op_type(a, b);
-
-  auto& out = outputs[0];
-  if (out.size() == 0) {
-    return;
-  }
-
-  // Try to collapse contiguous dims
-  auto [shape, strides] = collapse_contiguous_dims(a, b, out);
-  auto& strides_a = strides[0];
-  auto& strides_b = strides[1];
-  auto& strides_out = strides[2];
-
-  bool use_2d = out.data_size() > UINT32_MAX;
-  std::string kernel_name = get_kernel_name(bopt, op, a, use_2d, shape.size());
-  auto& d = metal::device(s.device);
-
-  auto kernel =
-      get_binary_two_kernel(d, kernel_name, a.dtype(), outputs[0].dtype(), op);
-
-  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
-
-  // - If a is donated it goes to the first output
-  // - If b is donated it goes to the first output if a was not donated
-  //   otherwise it goes to the second output
-  bool donate_a = a.data_shared_ptr() == nullptr;
-  bool donate_b = b.data_shared_ptr() == nullptr;
-  compute_encoder.set_input_array(donate_a ? outputs[0] : a, 0);
-  compute_encoder.set_input_array(
-      donate_b ? (donate_a ? outputs[1] : outputs[0]) : b, 1);
-  compute_encoder.set_output_array(outputs[0], 2);
-  compute_encoder.set_output_array(outputs[1], 3);
-
-  if (bopt == BinaryOpType::General) {
-    auto ndim = shape.size();
-    if (ndim > 3) {
-      compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 4);
-      compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 5);
-      compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 6);
-    } else {
-      // The shape is implicit in the grid for <= 3D
-      compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 4);
-      compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 5);
-    }
-
-    if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
-      compute_encoder->setBytes(&ndim, sizeof(int), 7);
-    }
-
-    // Launch up to 3D grid of threads
-    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
-    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
-    size_t rest = out.size() / (dim0 * dim1);
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
-    if (thread_group_size != 1024) {
-      throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
-    }
-    auto group_dims = get_block_dims(dim0, dim1, rest);
-    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
-  } else {
-    // Launch a 1D or 2D grid of threads
-    size_t nthreads = out.data_size();
-    MTL::Size grid_dims = use_2d
-        ? get_2d_grid_dims(outputs[0].shape(), outputs[0].strides())
-        : MTL::Size(nthreads, 1, 1);
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
-    if (thread_group_size > nthreads) {
-      thread_group_size = nthreads;
-    }
-    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
-  }
-}
-
-void binary_op_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs,
-    const std::string& op,
-    const Stream& s) {
-  assert(inputs.size() == 2);
-  auto& a = inputs[0];
-  auto& b = inputs[1];
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, outputs[0], bopt, true);
-  set_binary_op_output_data(a, b, outputs[1], bopt, true);
-  binary_op_gpu_inplace(inputs, outputs, op, s);
-}
-
-void binary_op_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs,
-    const std::string& op) {
-  auto& s = outputs[0].primitive().stream();
-  binary_op_gpu(inputs, outputs, op, s);
-}
-
-void binary_op_gpu_inplace(
-    const std::vector<array>& inputs,
-    array& out,
-    const std::string& op,
-    const Stream& s) {
-  auto& a = inputs[0];
-  auto& b = inputs[1];
-  auto bopt = get_binary_op_type(a, b);
-  if (out.size() == 0) {
-    return;
-  }
-
-  // Try to collapse contiguous dims
-  auto [shape, strides] = collapse_contiguous_dims(a, b, out);
-  auto& strides_a = strides[0];
-  auto& strides_b = strides[1];
-  auto& strides_out = strides[2];
-
-  bool use_2d = out.data_size() > UINT32_MAX;
-  std::string kernel_name = get_kernel_name(bopt, op, a, use_2d, shape.size());
-  auto& d = metal::device(s.device);
-
-  auto kernel = get_binary_kernel(d, kernel_name, a.dtype(), out.dtype(), op);
-  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
-  bool donate_a = a.data_shared_ptr() == nullptr;
-  bool donate_b = b.data_shared_ptr() == nullptr;
-  compute_encoder.set_input_array(donate_a ? out : a, 0);
-  compute_encoder.set_input_array(donate_b ? out : b, 1);
-  compute_encoder.set_output_array(out, 2);
-
-  if (bopt == BinaryOpType::General) {
-    auto ndim = shape.size();
-    if (ndim > 3) {
-      compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 3);
-      compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 4);
-      compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 5);
-    } else {
-      // The shape is implicit in the grid for <= 3D
-      compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 3);
-      compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 4);
-    }
-
-    if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
-      compute_encoder->setBytes(&ndim, sizeof(int), 6);
-    }
-
-    // Launch up to 3D grid of threads
-    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
-    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
-    size_t rest = out.size() / (dim0 * dim1);
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
-    if (thread_group_size != 1024) {
-      throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
-    }
-    auto group_dims = get_block_dims(dim0, dim1, rest);
-    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
-  } else {
-    // Launch a 1D or 2D grid of threads
-
-    size_t nthreads = out.data_size();
-    MTL::Size grid_dims = use_2d ? get_2d_grid_dims(out.shape(), out.strides())
-                                 : MTL::Size(nthreads, 1, 1);
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
-    if (thread_group_size > nthreads) {
-      thread_group_size = nthreads;
-    }
-    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
-  }
-}
-
-void binary_op_gpu(
-    const std::vector<array>& inputs,
-    array& out,
-    const std::string& op,
-    const Stream& s) {
-  assert(inputs.size() == 2);
-  auto& a = inputs[0];
-  auto& b = inputs[1];
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt, true);
-  binary_op_gpu_inplace(inputs, out, op, s);
-}
-
-void binary_op_gpu(
-    const std::vector<array>& inputs,
-    array& out,
-    const std::string& op) {
-  auto& s = out.primitive().stream();
-  binary_op_gpu(inputs, out, op, s);
-}
-
-BINARY_GPU(Add)
-BINARY_GPU(ArcTan2)
-BINARY_GPU(Divide)
-BINARY_GPU_MULTI(DivMod)
-BINARY_GPU(Remainder)
-BINARY_GPU(Equal)
-BINARY_GPU(Greater)
-BINARY_GPU(GreaterEqual)
-BINARY_GPU(Less)
-BINARY_GPU(LessEqual)
-BINARY_GPU(LogicalAnd)
-BINARY_GPU(LogicalOr)
-BINARY_GPU(LogAddExp)
-BINARY_GPU(Maximum)
-BINARY_GPU(Minimum)
-BINARY_GPU(Multiply)
-BINARY_GPU(NotEqual)
-BINARY_GPU(Power)
-BINARY_GPU(Subtract)
-
-void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
-  switch (op_) {
-    case BitwiseBinary::And:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
-      break;
-    case BitwiseBinary::Or:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
-      break;
-    case BitwiseBinary::Xor:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
-      break;
-    case BitwiseBinary::LeftShift:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
-      break;
-    case BitwiseBinary::RightShift:
-      binary_op_gpu(inputs, out, get_primitive_string(this));
-      break;
-  }
-}
-
-} // namespace mlx::core
--- a/mlx/backend/metal/binary.h
+++ b/mlx/backend/metal/binary.h
@@ -1,33 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#pragma once
-
-#include "mlx/array.h"
-
-namespace mlx::core {
-
-void binary_op_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs,
-    const std::string& op,
-    const Stream& s);
-
-void binary_op_gpu(
-    const std::vector<array>& inputs,
-    array& out,
-    const std::string& op,
-    const Stream& s);
-
-void binary_op_gpu_inplace(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs,
-    const std::string& op,
-    const Stream& s);
-
-void binary_op_gpu_inplace(
-    const std::vector<array>& inputs,
-    array& out,
-    const std::string& op,
-    const Stream& s);
-
-} // namespace mlx::core
--- a/mlx/backend/metal/compiled.cpp
+++ b/mlx/backend/metal/compiled.cpp
@@ -4,8 +4,8 @@

 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/common/utils.h"
+#include "mlx/backend/metal/compiled_preamble.h"
 #include "mlx/backend/metal/device.h"
-#include "mlx/backend/metal/jit/includes.h"
 #include "mlx/backend/metal/utils.h"
 #include "mlx/graph_utils.h"
 #include "mlx/primitives.h"
@@ -56,15 +56,12 @@ inline void build_kernel(
    } else {
      add_indices = true;
      os << "    device const " << get_type_string(x.dtype()) << "* " << xname
-         << " [[buffer(" << cnt++ << ")]]," << std::endl;
+         << " [[buffer(" << cnt++ << ")]]," << std::endl
+         << "    constant const size_t* " << xname << "_strides [[buffer("
+         << cnt++ << ")]]," << std::endl;
    }
  }

-  if (add_indices) {
-    os << "    constant const size_t* in_strides [[buffer(" << cnt++
-       << ")]],\n";
-  }
-
  // Add the output arguments
  for (auto& x : outputs) {
    os << "    device " << get_type_string(x.dtype()) << "* "
@@ -113,17 +110,13 @@ inline void build_kernel(
  }

  // Read the inputs in tmps
-  int nc_in_count = 0;
-  for (int i = 0; i < inputs.size(); ++i) {
-    auto& x = inputs[i];
+  for (auto& x : inputs) {
    auto& xname = namer.get_name(x);

    if (is_constant(x)) {
-      auto type_str = get_type_string(x.dtype());
-      os << "  auto tmp_" << xname << " = static_cast<"
-         << get_type_string(x.dtype()) << ">(";
+      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = ";
      print_constant(os, x);
-      os << ");" << std::endl;
+      os << ";" << std::endl;
    } else if (is_scalar(x)) {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
         << xname << "[0];" << std::endl;
@@ -131,20 +124,17 @@ inline void build_kernel(
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
         << xname << "[index];" << std::endl;
    } else if (!dynamic_dims) {
-      int offset = nc_in_count * ndim;
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
         << xname << "[";
-      os << "index_0 * " << "in_strides[" << offset << "]";
+      os << "index_0 * " << xname << "_strides[0]";
      for (int i = 1; i < ndim; i++) {
-        os << " + index_" << i << " * " << "in_strides[" << offset + i << "]";
+        os << " + index_" << i << " * " << xname << "_strides[" << i << "]";
      }
      os << "];" << std::endl;
-      nc_in_count++;
    } else {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
-         << xname << "[elem_to_loc(index, output_shape, in_strides + "
-         << nc_in_count * ndim << ", ndim)];" << std::endl;
-      nc_in_count++;
+         << xname << "[elem_to_loc(index, output_shape, " << xname
+         << "_strides, ndim)];" << std::endl;
    }
  }

@@ -200,8 +190,7 @@ void Compiled::eval_gpu(
  // If not we have to build it ourselves
  if (lib == nullptr) {
    std::ostringstream kernel;
-    kernel << metal::utils() << metal::unary_ops() << metal::binary_ops()
-           << metal::ternary_ops();
+    kernel << metal::get_kernel_preamble() << std::endl;
    build_kernel(
        kernel,
        kernel_lib_ + "_contiguous",
@@ -306,7 +295,6 @@ void Compiled::eval_gpu(
  // Put the inputs in
  int cnt = 0;
  int stride_idx = 1; // idx 0 is the output strides
-  std::vector<size_t> in_strides;
  for (int i = 0; i < inputs.size(); i++) {
    if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
      continue;
@@ -314,17 +302,13 @@ void Compiled::eval_gpu(
    auto& x = inputs[i];
    compute_encoder.set_input_array(x, cnt++);
    if (!contiguous && !is_scalar(x)) {
-      in_strides.insert(
-          in_strides.end(),
-          strides[stride_idx].begin(),
-          strides[stride_idx].end());
+      compute_encoder->setBytes(
+          strides[stride_idx].data(),
+          strides[stride_idx].size() * sizeof(size_t),
+          cnt++);
      stride_idx++;
    }
  }
-  if (!in_strides.empty()) {
-    compute_encoder->setBytes(
-        in_strides.data(), in_strides.size() * sizeof(size_t), cnt++);
-  }

  compiled_allocate_outputs(
      inputs, outputs, inputs_, constant_ids_, contiguous, true);
@@ -352,7 +336,7 @@ void Compiled::eval_gpu(
    MTL::Size grid_dims(nthreads, 1, 1);
    MTL::Size group_dims(
        std::min(nthreads, kernel->maxTotalThreadsPerThreadgroup()), 1, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+    compute_encoder->dispatchThreads(grid_dims, group_dims);
  } else {
    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
@@ -363,7 +347,7 @@ void Compiled::eval_gpu(
    }
    auto group_dims = get_block_dims(dim0, dim1, rest);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+    compute_encoder->dispatchThreads(grid_dims, group_dims);
  }
 }

--- a/mlx/backend/metal/compiled_preamble.h
+++ b/mlx/backend/metal/compiled_preamble.h
@@ -0,0 +1,9 @@
+// Copyright © 2023-24 Apple Inc.
+
+#pragma once
+
+namespace mlx::core::metal {
+
+// Returns the source text that Compiled::eval_gpu writes ahead of the kernels
+// it builds at runtime.
+// NOTE(review): presumably defined in the build-generated
+// compiled_preamble.cpp (see make_compiled_preamble.sh) -- confirm.
+const char* get_kernel_preamble();
+
+} // namespace mlx::core::metal
--- a/mlx/backend/metal/conv.cpp
+++ b/mlx/backend/metal/conv.cpp
@@ -7,7 +7,6 @@

 #include "mlx/backend/metal/copy.h"
 #include "mlx/backend/metal/device.h"
-#include "mlx/backend/metal/kernels.h"
 #include "mlx/backend/metal/kernels/defines.h"
 #include "mlx/backend/metal/kernels/steel/conv/params.h"
 #include "mlx/backend/metal/matmul.h"
@@ -60,7 +59,7 @@ void explicit_gemm_conv_ND_gpu(
  MTL::Size grid_dims = MTL::Size(
      conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]);

-  compute_encoder.dispatchThreads(grid_dims, group_dims);
+  compute_encoder->dispatchThreads(grid_dims, group_dims);

  // Reshape weight
  std::vector<int> wt_reshape{implicit_K, implicit_N};
@@ -138,7 +137,7 @@ void explicit_gemm_conv_group_ND_gpu(
  MTL::Size grid_dims = MTL::Size(
      conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]);

-  compute_encoder.dispatchThreads(grid_dims, group_dims);
+  compute_encoder->dispatchThreads(grid_dims, group_dims);

  // Transpose kernel weights so that we can slice them by contiguous chunks
  // of channel groups.
@@ -248,7 +247,7 @@ void slow_conv_2D_gpu(
  compute_encoder.set_output_array(out, 2);

  compute_encoder->setBytes(&conv_params, sizeof(MLXConvParams<2>), 3);
-  compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+  compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
 }

 void implicit_gemm_conv_2D_gpu(
@@ -258,19 +257,15 @@ void implicit_gemm_conv_2D_gpu(
    const array& wt,
    array out,
    const MLXConvParams<2>& conv_params) {
-  const int groups = conv_params.groups;
-  const int C_per_group = conv_params.C / conv_params.groups;
-  const int O_per_group = conv_params.O / conv_params.groups;
-
  // Deduce implicit gemm size
-  const int implicit_M = conv_params.N * conv_params.oS[0] * conv_params.oS[1];
-  const int implicit_N = O_per_group;
-  const int implicit_K = conv_params.wS[0] * conv_params.wS[1] * C_per_group;
+  int implicit_M = conv_params.N * conv_params.oS[0] * conv_params.oS[1];
+  int implicit_N = conv_params.O;
+  int implicit_K = conv_params.wS[0] * conv_params.wS[1] * conv_params.C;

  // Determine block and warp tiles
  int wm = 2, wn = 2;

-  int bm = implicit_M >= 8192 && C_per_group >= 64 ? 64 : 32;
+  int bm = implicit_M >= 8192 && conv_params.C >= 64 ? 64 : 32;
  int bn = (bm == 64 || implicit_N >= 64) ? 64 : 32;
  int bk = 16;

@@ -286,15 +281,15 @@ void implicit_gemm_conv_2D_gpu(

  // Fix small channel specialization
  int n_channel_specialization = 0;
-  int channel_k_iters = ((C_per_group + bk - 1) / bk);
+  int channel_k_iters = ((conv_params.C + bk - 1) / bk);
  int gemm_k_iters = conv_params.wS[0] * conv_params.wS[1] * channel_k_iters;

-  if (C_per_group <= 2) {
+  if (conv_params.C <= 2) {
    gemm_k_iters = (implicit_K + bk - 1) / bk;
-    n_channel_specialization = C_per_group;
-  } else if (C_per_group <= 4) {
+    n_channel_specialization = conv_params.C;
+  } else if (conv_params.C <= 4) {
    gemm_k_iters = ((conv_params.wS[0] * conv_params.wS[1] * 4) + bk - 1) / bk;
-    n_channel_specialization = C_per_group;
+    n_channel_specialization = conv_params.C;
  }

  bool small_filter = (!n_channel_specialization) &&
@@ -336,17 +331,7 @@ void implicit_gemm_conv_2D_gpu(

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
-  auto kernel = get_steel_conv_kernel(
-      d,
-      kname.str(),
-      out,
-      bm,
-      bn,
-      bk,
-      wm,
-      wn,
-      n_channel_specialization,
-      small_filter);
+  auto kernel = d.get_kernel(kname.str());
  compute_encoder->setComputePipelineState(kernel);

  // Deduce grid launch dimensions
@@ -355,7 +340,7 @@ void implicit_gemm_conv_2D_gpu(
  size_t grid_dim_x = tn * tile;

  MTL::Size group_dims = MTL::Size(32, wn, wm);
-  MTL::Size grid_dims = MTL::Size(grid_dim_x, grid_dim_y, groups);
+  MTL::Size grid_dims = MTL::Size(grid_dim_x, grid_dim_y, 1);

  // Encode arrays
  compute_encoder.set_input_array(in, 0);
@@ -367,7 +352,7 @@ void implicit_gemm_conv_2D_gpu(
  compute_encoder->setBytes(&gemm_params, sizeof(ImplicitGemmConv2DParams), 4);

  // Launch kernel
-  compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+  compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
 }

 void implicit_gemm_conv_2D_general_gpu(
@@ -499,8 +484,7 @@ void implicit_gemm_conv_2D_general_gpu(

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
-  auto kernel =
-      get_steel_conv_general_kernel(d, kname.str(), out, bm, bn, bk, wm, wn);
+  auto kernel = d.get_kernel(kname.str());
  compute_encoder->setComputePipelineState(kernel);

  // Deduce grid launch dimensions
@@ -528,7 +512,7 @@ void implicit_gemm_conv_2D_general_gpu(
      base_w.data(), sizeof(Conv2DGeneralBaseInfo) * base_w.size(), 7);

  // Launch kernel
-  compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+  compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
 }

 void winograd_conv_2D_gpu(
@@ -629,7 +613,7 @@ void winograd_conv_2D_gpu(
    MTL::Size group_dims = MTL::Size(32, bo, 1);
    MTL::Size grid_dims = MTL::Size(O_c / bo, 1, 1);

-    compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+    compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
  }

  // Do input transform
@@ -657,7 +641,7 @@ void winograd_conv_2D_gpu(
    MTL::Size group_dims = MTL::Size(32, wn, wm);
    MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);

-    compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+    compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
  }

  // Do batched gemm
@@ -705,7 +689,7 @@ void winograd_conv_2D_gpu(
    MTL::Size group_dims = MTL::Size(32, wn, wm);
    MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);

-    compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+    compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
  }
 }

@@ -719,7 +703,6 @@ void conv_2D_gpu(
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
-    const int groups,
    bool flip,
    std::vector<array>& copies) {
  // Make conv params
@@ -735,12 +718,12 @@ void conv_2D_gpu(
      /* const int kdil[NDIM] = */ {wt_dilation[0], wt_dilation[1]},
      /* const int idil[NDIM] = */ {in_dilation[0], in_dilation[1]},
      /* const size_t in_strides[NDIM + 2] = */
-      {in.strides(0), in.strides(1), in.strides(2), in.strides(3)},
+      {in.strides()[0], in.strides()[1], in.strides()[2], in.strides()[3]},
      /* const size_t wt_strides[NDIM + 2] = */
-      {wt.strides(0), wt.strides(1), wt.strides(2), wt.strides(3)},
+      {wt.strides()[0], wt.strides()[1], wt.strides()[2], wt.strides()[3]},
      /* const size_t out_strides[NDIM + 2] = */
-      {out.strides(0), out.strides(1), out.strides(2), out.strides(3)},
-      /* const int groups = */ groups,
+      {out.strides()[0], out.strides()[1], out.strides()[2], out.strides()[3]},
+      /* const int groups = */ 1,
      /* const bool flip = */ flip,
  };

@@ -752,18 +735,6 @@ void conv_2D_gpu(
  bool channels_large = (conv_params.C + conv_params.O) >= 512;
  bool channels_med = (conv_params.C + conv_params.O) >= 256;

-  if (groups > 1) {
-    const int C_per_group = conv_params.C / groups;
-    const int O_per_group = conv_params.O / groups;
-
-    if (is_idil_one && (C_per_group <= 4 || C_per_group % 16 == 0) &&
-        (O_per_group <= 16 || O_per_group % 16 == 0)) {
-      return implicit_gemm_conv_2D_gpu(s, d, in, wt, out, conv_params);
-    } else {
-      return explicit_gemm_conv_group_ND_gpu(s, d, in, wt, out, conv_params);
-    }
-  }
-
  // Direct to winograd conv
  if (!flip && is_stride_one && is_kdil_one && is_idil_one &&
      conv_params.wS[0] == 3 && conv_params.wS[1] == 3 &&
@@ -788,56 +759,6 @@ void conv_2D_gpu(
  }
 }

-void conv_3D_gpu(
-    const Stream& s,
-    metal::Device& d,
-    const array& in,
-    const array& wt,
-    array out,
-    const std::vector<int>& padding,
-    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation,
-    const std::vector<int>& in_dilation,
-    bool flip,
-    std::vector<array>& copies) {
-  // Make conv params
-  MLXConvParams<3> conv_params{
-      /* const int  N = */ in.shape(0),
-      /* const int  C = */ in.shape(4),
-      /* const int  O = */ wt.shape(0),
-      /* const int iS[NDIM] = */ {in.shape(1), in.shape(2), in.shape(3)},
-      /* const int wS[NDIM] = */ {wt.shape(1), wt.shape(2), wt.shape(3)},
-      /* const int oS[NDIM] = */ {out.shape(1), out.shape(2), out.shape(3)},
-      /* const int str[NDIM] = */ {wt_strides[0], wt_strides[1], wt_strides[2]},
-      /* const int pad[NDIM] = */ {padding[0], padding[1], padding[2]},
-      /* const int kdil[NDIM] = */
-      {wt_dilation[0], wt_dilation[1], wt_dilation[2]},
-      /* const int idil[NDIM] = */
-      {in_dilation[0], in_dilation[1], in_dilation[2]},
-      /* const size_t in_strides[NDIM + 2] = */
-      {in.strides()[0],
-       in.strides()[1],
-       in.strides()[2],
-       in.strides()[3],
-       in.strides()[4]},
-      /* const size_t wt_strides[NDIM + 2] = */
-      {wt.strides()[0],
-       wt.strides()[1],
-       wt.strides()[2],
-       wt.strides()[3],
-       wt.strides()[4]},
-      /* const size_t out_strides[NDIM + 2] = */
-      {out.strides()[0],
-       out.strides()[1],
-       out.strides()[2],
-       out.strides()[3],
-       out.strides()[4]},
-      /* const int groups = */ 1,
-      /* const bool flip = */ flip,
-  };
-  return explicit_gemm_conv_ND_gpu(s, d, in, wt, out, conv_params);
-}
-
 } // namespace

 void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -862,23 +783,8 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
    wt = arr_copy;
  }

-  // 3D conv
-  if (out.ndim() == 5) {
-    conv_3D_gpu(
-        s,
-        d,
-        in,
-        wt,
-        out,
-        padding_,
-        kernel_strides_,
-        kernel_dilation_,
-        input_dilation_,
-        flip_,
-        copies);
-  }
  // 2D conv
-  else if (out.ndim() == 4) {
+  if (out.ndim() == 4) {
    conv_2D_gpu(
        s,
        d,
@@ -889,7 +795,6 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
        kernel_strides_,
        kernel_dilation_,
        input_dilation_,
-        groups_,
        flip_,
        copies);
  }
--- a/mlx/backend/metal/copy.cpp
+++ b/mlx/backend/metal/copy.cpp
@@ -4,14 +4,12 @@

 #include "mlx/backend/metal/copy.h"
 #include "mlx/backend/metal/device.h"
-#include "mlx/backend/metal/kernels.h"
+#include "mlx/backend/metal/kernels/defines.h"
 #include "mlx/backend/metal/utils.h"
 #include "mlx/primitives.h"

 namespace mlx::core {

-constexpr int MAX_COPY_SPECIALIZED_DIMS = 5;
-
 void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
  if (ctype == CopyType::Vector) {
    // If the input is donateable, we are doing a vector copy and the types
@@ -33,6 +31,9 @@ void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
  } else {
    out.set_data(allocator::malloc_or_wait(out.nbytes()));
  }
+  if (out.size() == 0) {
+    return;
+  }
  if (ctype == CopyType::GeneralGeneral) {
    ctype = CopyType::General;
  }
@@ -54,46 +55,34 @@ void copy_gpu_inplace(
    int64_t out_offset,
    CopyType ctype,
    const Stream& s) {
-  if (out.size() == 0) {
-    return;
-  }
-
  // Try to collapse contiguous dims
  auto [shape, strides] = collapse_contiguous_dims(
      data_shape, std::vector{strides_in_pre, strides_out_pre});
  auto& strides_in_ = strides[0];
  auto& strides_out_ = strides[1];

-  bool use_2d = out.data_size() > UINT32_MAX;
  auto& d = metal::device(s.device);
-  std::string kernel_name;
-  {
-    std::ostringstream kname;
-    switch (ctype) {
-      case CopyType::Scalar:
-        kname << (use_2d ? "s2" : "s");
-        break;
-      case CopyType::Vector:
-        kname << (use_2d ? "v2" : "v");
-        break;
-      case CopyType::General:
-        kname << "g";
-        break;
-      case CopyType::GeneralGeneral:
-        kname << "gg";
-        break;
-    }
-    if ((ctype == CopyType::General || ctype == CopyType::GeneralGeneral) &&
-        shape.size() <= MAX_COPY_SPECIALIZED_DIMS) {
-      kname << shape.size();
-    }
-    kname << "_copy";
-    kname << type_to_name(in) << type_to_name(out);
-    kernel_name = kname.str();
+  std::ostringstream kname;
+  switch (ctype) {
+    case CopyType::Scalar:
+      kname << "scopy";
+      break;
+    case CopyType::Vector:
+      kname << "vcopy";
+      break;
+    case CopyType::General:
+      kname << "gcopy";
+      break;
+    case CopyType::GeneralGeneral:
+      kname << "ggcopy";
+      break;
  }
-
-  auto kernel = get_copy_kernel(d, kernel_name, in, out);
-
+  kname << type_to_name(in) << type_to_name(out);
+  if ((ctype == CopyType::General || ctype == CopyType::GeneralGeneral) &&
+      shape.size() <= MAX_COPY_SPECIALIZED_DIMS) {
+    kname << "_" << shape.size();
+  }
+  auto kernel = d.get_kernel(kname.str());
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder->setComputePipelineState(kernel);
  bool donate_in = in.data_shared_ptr() == nullptr;
@@ -117,7 +106,7 @@ void copy_gpu_inplace(
      set_vector_bytes(compute_encoder, strides_out, ndim, 4);
    }

-    if (ndim > MAX_COPY_SPECIALIZED_DIMS) {
+    if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
      compute_encoder->setBytes(&ndim, sizeof(int), 5);
    }

@@ -137,17 +126,16 @@ void copy_gpu_inplace(

    auto group_dims = get_block_dims(dim0, dim1, rest);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+    compute_encoder->dispatchThreads(grid_dims, group_dims);
  } else {
    size_t nthreads = out.data_size();
-    MTL::Size grid_dims = use_2d ? get_2d_grid_dims(out.shape(), out.strides())
-                                 : MTL::Size(nthreads, 1, 1);
+    MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
    if (thread_group_size > nthreads) {
      thread_group_size = nthreads;
    }
    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+    compute_encoder->dispatchThreads(grid_dims, group_dims);
  }
 }

--- a/mlx/backend/metal/custom_kernel.cpp
+++ b/mlx/backend/metal/custom_kernel.cpp
@@ -1,89 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include "mlx/backend/metal/copy.h"
-#include "mlx/backend/metal/jit/includes.h"
-#include "mlx/backend/metal/utils.h"
-#include "mlx/fast_primitives.h"
-
-namespace mlx::core::fast {
-
-void CustomKernel::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  auto& s = stream();
-
-  std::vector<array> copies;
-
-  for (auto& out : outputs) {
-    out.set_data(allocator::malloc_or_wait(out.nbytes()));
-    if (init_value_) {
-      array init = array(init_value_.value(), out.dtype());
-      copy_gpu(init, out, CopyType::Scalar, s);
-      copies.push_back(init);
-    }
-  }
-
-  auto check_input = [&copies, &s, this](const array& x) -> const array {
-    bool no_copy = x.flags().row_contiguous;
-    if (!ensure_row_contiguous_ || no_copy) {
-      return x;
-    } else {
-      copies.push_back(array(x.shape(), x.dtype(), nullptr, {}));
-      copy_gpu(x, copies.back(), CopyType::General, s);
-      return copies.back();
-    }
-  };
-  std::vector<const array> checked_inputs;
-  for (const array& in : inputs) {
-    checked_inputs.push_back(check_input(in));
-  }
-
-  auto& d = metal::device(s.device);
-  const auto& lib_name = name_;
-  auto lib = d.get_library(lib_name);
-  if (lib == nullptr) {
-    lib = d.get_library(lib_name, metal::utils() + source_);
-  }
-  auto kernel = d.get_kernel(name_, lib);
-  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
-  int index = 0;
-  for (int i = 0; i < checked_inputs.size(); i++) {
-    const array& in = checked_inputs[i];
-    auto shape_info = shape_infos_[i];
-    compute_encoder.set_input_array(in, index);
-    index++;
-    if (in.ndim() > 0) {
-      int ndim = in.ndim();
-      if (shape_info.shape) {
-        set_vector_bytes(compute_encoder, in.shape(), ndim, index);
-        index++;
-      }
-      if (shape_info.strides) {
-        set_vector_bytes(compute_encoder, in.strides(), ndim, index);
-        index++;
-      }
-      if (shape_info.ndim) {
-        compute_encoder->setBytes(&ndim, sizeof(int), index);
-        index++;
-      }
-    }
-  }
-  for (array out : outputs) {
-    compute_encoder.set_output_array(out, index);
-    index++;
-  }
-
-  const auto [tx, ty, tz] = threadgroup_;
-  MTL::Size group_dims = MTL::Size(tx, ty, tz);
-  const auto [gx, gy, gz] = grid_;
-  MTL::Size grid_dims = MTL::Size(gx, gy, gz);
-  compute_encoder->dispatchThreads(grid_dims, group_dims);
-
-  if (!copies.empty()) {
-    d.get_command_buffer(s.index)->addCompletedHandler(
-        [copies](MTL::CommandBuffer*) mutable { copies.clear(); });
-  }
-}
-
-} // namespace mlx::core::fast
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Angelos Katharopoulos	8242d6d5ef	Add locks to FileStream	2024-05-08 23:19:27 -07:00
Angelos Katharopoulos	bae159738f	Working IO primitives	2024-05-08 22:17:25 -07:00
Angelos Katharopoulos	b193741050	Change Load to be an IOPrimitive	2024-05-08 18:59:20 -07:00
Angelos Katharopoulos	c8e2b42ced	Add the io threadpool and task	2024-05-08 18:02:22 -07:00
Angelos Katharopoulos	be36f136de	Add io device and cpu::make_task	2024-05-07 16:58:14 -07:00