Add the 3bit packed qmm_t

Add 3bit packed quants
Fix the optional in gather_qmm python binding
2025-09-08 22:49:55 +08:00 · 2024-12-17 22:16:30 -08:00 · 2024-12-17 10:49:13 -08:00 · 2024-12-16 22:14:19 -08:00 · 2024-12-16 22:11:23 -08:00 · 2024-12-16 21:49:14 -08:00
274 changed files with 8379 additions and 14203 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -85,7 +85,7 @@ jobs:
          name: Install dependencies
          command: |
            pip install --upgrade cmake
-            pip install nanobind==2.4.0
+            pip install nanobind==2.2.0
            pip install numpy
            sudo apt-get update
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
@@ -137,7 +137,7 @@ jobs:
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.4.0
+            pip install nanobind==2.2.0
            pip install numpy
            pip install torch
            pip install tensorflow
@@ -160,7 +160,6 @@ jobs:
            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
            mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
-            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py
      - run:
          name: Build example extension
          command: |
@@ -227,7 +226,7 @@ jobs:
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.4.0
+            pip install nanobind==2.2.0
            pip install --upgrade setuptools
            pip install numpy
            pip install twine
@@ -292,7 +291,7 @@ jobs:
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.4.0
+            pip install nanobind==2.2.0
            pip install --upgrade setuptools
            pip install numpy
            pip install auditwheel
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,16 +1,16 @@
 repos:
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v19.1.7
+    rev: v19.1.4
    hooks:
    -   id: clang-format
 # Using this mirror lets us use mypyc-compiled black, which is about 2x faster
 -   repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 25.1.0
+    rev: 24.10.0
    hooks:
    -   id: black
    
 -   repo: https://github.com/pycqa/isort
-    rev: 6.0.0
+    rev: 5.13.2
    hooks:
    -   id: isort
        args:
--- a/ACKNOWLEDGMENTS.md
+++ b/ACKNOWLEDGMENTS.md
@@ -7,7 +7,7 @@ with a short description of your contribution(s) below. For example:

 MLX was developed with contributions from the following individuals:

- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`. Added `cross`. Added `orthogonal` initializer.
+- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`. Added `cross`.
 - Juarez Bochi: Fixed bug in cross attention.
 - Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
 - Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream`, safetensors support, `einsum`, and `einsum_path`.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.25)
+cmake_minimum_required(VERSION 3.24)

 project(mlx LANGUAGES C CXX)

@@ -25,9 +25,8 @@ option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
 option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)

 if(NOT MLX_VERSION)
-  set(MLX_VERSION 0.22.1)
+  set(MLX_VERSION 0.21.1)
 endif()
-add_compile_definitions("MLX_VERSION=${MLX_VERSION}")

 # --------------------- Processor tests -------------------------

@@ -127,10 +126,7 @@ if(WIN32)
    GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
    GIT_TAG v1.4.1
    EXCLUDE_FROM_ALL)
-  block()
-  set(BUILD_SHARED_LIBS OFF)
  FetchContent_MakeAvailable(dlfcn-win32)
-  endblock()
  target_include_directories(mlx PRIVATE "${dlfcn-win32_SOURCE_DIR}/src")
  target_link_libraries(mlx PRIVATE dl)
 endif()
@@ -147,7 +143,6 @@ if(MLX_BUILD_CPU)

  if(MLX_BUILD_ACCELERATE)
    target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
-    add_compile_definitions(MLX_USE_ACCELERATE)
    add_compile_definitions(ACCELERATE_NEW_LAPACK)
  elseif(MLX_BUILD_BLAS_FROM_SOURCE)
    # Download and build OpenBLAS from source code.
@@ -241,7 +236,8 @@ if(MLX_BUILD_PYTHON_BINDINGS)
  execute_process(
    COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
    OUTPUT_STRIP_TRAILING_WHITESPACE
-    OUTPUT_VARIABLE nanobind_ROOT)
+    OUTPUT_VARIABLE NB_DIR)
+  list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
  find_package(nanobind CONFIG REQUIRED)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/python/src)
 endif()
--- a/benchmarks/python/packed_qmm_bench.py
+++ b/benchmarks/python/packed_qmm_bench.py
@@ -0,0 +1,74 @@
+import argparse
+import math
+
+import mlx.core as mx
+from time_utils import time_fn
+
+B = 1024
+D = 1024
+M = 4 * D
+group_size = 64
+bits = 4
+dtype = mx.float16
+loops = 10
+
+
+def qmm_(x, wq1, wq2, q_type):
+    for i in range(loops):
+        x = mx.quantized_matmul(
+            x,
+            *wq1,
+            group_size=group_size,
+            bits=bits,
+            quantization_type=q_type,
+        )
+        x = mx.quantized_matmul(
+            x,
+            *wq2,
+            group_size=group_size,
+            bits=bits,
+            quantization_type=q_type,
+        )
+    return x
+
+
+def affine_qmm(x, wq1, wq2):
+    return qmm_(x, wq1, wq2, "affine")
+
+
+def affine_packed_qmm(x, wq1, wq2):
+    return qmm_(x, wq1, wq2, "affine-packed")
+
+
+def time_qmm():
+    mx.random.seed(3)
+    x = mx.random.normal(shape=(B, D)).astype(dtype)
+    w1 = mx.random.normal(shape=(M, D)).astype(dtype)
+    wq1 = mx.quantize(w1, group_size=group_size, bits=bits, quantization_type="affine")
+    w2 = mx.random.normal(shape=(D, M)).astype(dtype)
+    wq2 = mx.quantize(w2, group_size=group_size, bits=bits, quantization_type="affine")
+    mx.eval(x, wq1, wq2)
+    time_fn(affine_qmm, x, wq1, wq2)
+
+
+def time_packed_qmm():
+    mx.random.seed(3)
+    x = mx.random.normal(shape=(B, D)).astype(dtype)
+    w1 = mx.random.normal(shape=(M, D)).astype(dtype)
+    wq1 = mx.quantize(
+        w1, group_size=group_size, bits=bits, quantization_type="affine-packed"
+    )
+    w2 = mx.random.normal(shape=(D, M)).astype(dtype)
+    wq2 = mx.quantize(
+        w2, group_size=group_size, bits=bits, quantization_type="affine-packed"
+    )
+    mx.eval(x, wq1, wq2)
+    time_fn(affine_packed_qmm, x, wq1, wq2)
+
+
+if __name__ == "__main__":
+    for b in [2, 4, 8]:
+        bits = b
+        print(f"Bits {bits}:")
+        time_qmm()
+        time_packed_qmm()
--- a/benchmarks/python/sdpa_vector_bench.py
+++ b/benchmarks/python/sdpa_vector_bench.py
@@ -8,44 +8,30 @@ L = 16384
 H = 32
 H_k = H // 4
 D = 128
-V = 128
 dtype = mx.float16
 loops = 10


-def upproject(x, w):
-    if w is None:
-        return x
-    else:
-        return x @ w.T
-
-
-def attention(q, k, v, mask=None, w=None):
+def attention(q, k, v):
    def _sdpa(q, k, v):
        B, Hq, L, D = q.shape
        _, Hk, S, _ = k.shape
-        _, _, _, V = v.shape
        q = q.reshape(B, Hk, Hq // Hk, L, D)
        k = k[:, :, None, :, :]
        v = v[:, :, None, :, :]
        s = q @ k.transpose(0, 1, 2, 4, 3)
-        if mask is not None:
-            m = mx.broadcast_to(mask, (B, Hq, L, S)).reshape(B, Hk, Hq // Hk, L, S)
-            s = mx.where(m, s, mx.finfo(s.dtype).min)
        p = mx.softmax(s.astype(mx.float32), axis=-1).astype(s.dtype)
        o = p @ v
-        return o.reshape(B, Hq, L, V)
+        return o.reshape(B, Hq, L, D)

    for i in range(loops):
        q = _sdpa(q, k, v)
-        q = upproject(q, w)
    return q


-def sdpa(q, k, v, mask=None, w=None):
+def sdpa(q, k, v):
    for i in range(loops):
-        q = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0, mask=mask)
-        q = upproject(q, w)
+        q = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0)
    return q


@@ -53,43 +39,20 @@ def time_self_attention_primitives():
    mx.random.seed(3)
    q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
    k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
-    v = mx.random.uniform(shape=(1, H_k, L, V)).astype(dtype)
-    w = mx.random.uniform(shape=(D, V)).astype(dtype) if V != D else None
-    mx.eval(q, k, v, w)
-    time_fn(attention, q, k, v, w=w)
+    v = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
+    mx.eval(q, k, v)
+    time_fn(attention, q, k, v)


 def time_self_attention_sdpa():
    mx.random.seed(3)
    q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
    k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
-    v = mx.random.uniform(shape=(1, H_k, L, V)).astype(dtype)
-    w = mx.random.uniform(shape=(D, V)).astype(dtype) if V != D else None
-    mx.eval(q, k, v, w)
-    time_fn(sdpa, q, k, v, w=w)
-
-
-def time_self_attention_sdpa_with_mask():
-    mx.random.seed(3)
-    q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
-    k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
-    v = mx.random.uniform(shape=(1, H_k, L, V)).astype(dtype)
-    w = mx.random.uniform(shape=(D, V)).astype(dtype) if V != D else None
-    mask = mx.full((L,), True)
-    mask[L // 2 :] = False
-    mx.eval(q, k, v, mask, w)
-
-    def sdpa_mask(*args):
-        return sdpa(*args, mask=mask, w=w)
-
-    def attention_mask(*args):
-        return attention(*args, mask=mask, w=w)
-
-    time_fn(attention_mask, q, k, v)
-    time_fn(sdpa_mask, q, k, v)
+    v = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
+    mx.eval(q, k, v)
+    time_fn(sdpa, q, k, v)


 if __name__ == "__main__":
    time_self_attention_sdpa()
    time_self_attention_primitives()
-    time_self_attention_sdpa_with_mask()
--- a/benchmarks/python/synchronize_bench.py
+++ b/benchmarks/python/synchronize_bench.py
@@ -1,55 +0,0 @@
-import time
-
-import mlx.core as mx
-
-rank = mx.distributed.init().rank()
-
-
-def timeit(fn, a):
-
-    # warmup
-    for _ in range(5):
-        mx.eval(fn(a))
-
-    its = 10
-    tic = time.perf_counter()
-    for _ in range(its):
-        mx.eval(fn(a))
-    toc = time.perf_counter()
-    ms = 1000 * (toc - tic) / its
-    return ms
-
-
-def all_reduce_benchmark():
-    a = mx.ones((5, 5), mx.int32)
-
-    its_per_eval = 100
-
-    def fn(x):
-        for _ in range(its_per_eval):
-            x = mx.distributed.all_sum(x)
-            x = x - 1
-        return x
-
-    ms = timeit(fn, a) / its_per_eval
-    if rank == 0:
-        print(f"All Reduce: time per iteration {ms:.6f} (ms)")
-
-
-def all_gather_benchmark():
-    a = mx.ones((5, 5), mx.int32)
-    its_per_eval = 100
-
-    def fn(x):
-        for _ in range(its_per_eval):
-            x = mx.distributed.all_gather(x)[0]
-        return x
-
-    ms = timeit(fn, a) / its_per_eval
-    if rank == 0:
-        print(f"All gather: time per iteration {ms:.6f} (ms)")
-
-
-if __name__ == "__main__":
-    all_reduce_benchmark()
-    all_gather_benchmark()
--- a/docs/src/dev/mlx_in_cpp.rst
+++ b/docs/src/dev/mlx_in_cpp.rst
@@ -1,121 +0,0 @@
-.. _mlx_in_cpp:
-
-Using MLX in C++
-================
-
-You can use MLX in a C++ project with CMake.
-
-.. note::
-
-  This guide is based on the following `example using MLX in C++
-  <https://github.com/ml-explore/mlx/tree/main/examples/cmake_project>`_
-
-First install MLX:
-
-.. code-block:: bash
-
-  pip install -U mlx
-
-You can also install the MLX Python package from source or just the C++
-library. For more information see the :ref:`documentation on installing MLX
-<build_and_install>`.
-
-Next make an example program in ``example.cpp``: 
-
-.. code-block:: C++
-
-  #include <iostream>
-
-  #include "mlx/mlx.h"
-
-  namespace mx = mlx::core;
-
-  int main() {
-    auto x = mx::array({1, 2, 3});
-    auto y = mx::array({1, 2, 3});
-    std::cout << x + y << std::endl;
-    return 0;
-  }
-
-The next step is to setup a CMake file in ``CMakeLists.txt``:
-
-.. code-block:: cmake
-
-  cmake_minimum_required(VERSION 3.27)
-
-  project(example LANGUAGES CXX)
-
-  set(CMAKE_CXX_STANDARD 17)
-  set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-
-Depending on how you installed MLX, you may need to tell CMake where to
-find it. 
-
-If you installed MLX with Python, then add the following to the CMake file:
-
-.. code-block:: cmake
-
-  find_package(
-    Python 3.9
-    COMPONENTS Interpreter Development.Module
-    REQUIRED)
-  execute_process(
-    COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
-    OUTPUT_STRIP_TRAILING_WHITESPACE
-    OUTPUT_VARIABLE MLX_ROOT)
-
-If you installed the MLX C++ package to a system path, then CMake should be
-able to find it. If you installed it to a non-standard location or CMake can't
-find MLX then set ``MLX_ROOT`` to the location where MLX is installed:
-
-.. code-block:: cmake
-
-  set(MLX_ROOT "/path/to/mlx/")
-
-Next, instruct CMake to find MLX:
-
-.. code-block:: cmake
-
-  find_package(MLX CONFIG REQUIRED)
-
-Finally, add the ``example.cpp`` program as an executable and link MLX.
-
-.. code-block:: cmake
-
-  add_executable(example example.cpp)
-  target_link_libraries(example PRIVATE mlx)
-
-You can build the example with:
-
-.. code-block:: bash
-
-  cmake -B build -DCMAKE_BUILD_TYPE=Release
-  cmake --build build
-
-And run it with:
-
-.. code-block:: bash
-
-  ./build/example
-
-Note ``find_package(MLX CONFIG REQUIRED)`` sets the following variables:
-
-.. list-table:: Package Variables
-   :widths: 20 20 
-   :header-rows: 1
-
-   * - Variable 
-     - Description 
-   * - MLX_FOUND
-     - ``True`` if MLX is found
-   * - MLX_INCLUDE_DIRS
-     - Include directory
-   * - MLX_LIBRARIES
-     - Libraries to link against
-   * - MLX_CXX_FLAGS
-     - Additional compiler flags
-   * - MLX_BUILD_ACCELERATE
-     - ``True`` if MLX was built with Accelerate 
-   * - MLX_BUILD_METAL
-     - ``True`` if MLX was built with Metal
--- a/docs/src/index.rst
+++ b/docs/src/index.rst
@@ -45,7 +45,6 @@ are the CPU and GPU.
   usage/numpy
   usage/distributed
   usage/using_streams
-   usage/export

 .. toctree::
   :caption: Examples
@@ -62,7 +61,6 @@ are the CPU and GPU.
   python/array
   python/data_types
   python/devices_and_streams
-   python/export
   python/ops
   python/random
   python/transforms
@@ -88,4 +86,3 @@ are the CPU and GPU.
   dev/extensions
   dev/metal_debugger
   dev/custom_metal_kernels
-   dev/mlx_in_cpp
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -1,5 +1,3 @@
-.. _build_and_install:
-
 Build and Install
 =================

@@ -55,7 +53,7 @@ Build Requirements
 ^^^^^^^^^^^^^^^^^^

 - A C++ compiler with C++17 support (e.g. Clang >= 5.0)
- `cmake <https://cmake.org/>`_ -- version 3.25 or later, and ``make``
+- `cmake <https://cmake.org/>`_ -- version 3.24 or later, and ``make``
 - Xcode >= 15.0 and macOS SDK >= 14.0

 .. note::
--- a/docs/src/python/data_types.rst
+++ b/docs/src/python/data_types.rst
@@ -66,4 +66,3 @@ documentation for more information. Use :func:`issubdtype` to determine if one
   Dtype
   DtypeCategory
   issubdtype
-   finfo
--- a/docs/src/python/export.rst
+++ b/docs/src/python/export.rst
@@ -1,14 +0,0 @@
-.. _export:
-
-Export Functions
-================
-
-.. currentmodule:: mlx.core
-
-.. autosummary::
-  :toctree: _autosummary
-
-   export_function
-   import_function
-   exporter
-   export_to_dot
--- a/docs/src/python/ops.rst
+++ b/docs/src/python/ops.rst
@@ -89,7 +89,6 @@ Operations
   isneginf
   isposinf
   issubdtype
-   kron
   left_shift
   less
   less_equal
@@ -145,8 +144,6 @@ Operations
   sign
   sin
   sinh
-   slice
-   slice_update
   softmax
   sort
   split
--- a/docs/src/usage/compile.rst
+++ b/docs/src/usage/compile.rst
@@ -421,77 +421,3 @@ the most opportunity to optimize the computation graph:
  # Compiling the outer function is good to do as it will likely
  # be faster even though the inner functions are compiled
  fun = mx.compile(outer)
-
-
-
-.. _shapeless_compile:
-
-Shapeless Compilation
---------------------
-
-When the shape of an input to a compiled function changes, the function is
-recompiled. You can compile a function once and run it on inputs with
-variable shapes by specifying ``shapeless=True`` to :func:`compile`. In this
-case changes to the shapes of the inputs do not cause the function to be
-recompiled.
-
-.. code-block:: python
-
-  def fun(x, y):
-      return mx.abs(x + y)
-
-  compiled_fun = mx.compile(fun, shapeless=True)
-
-  x = mx.array(1.0)
-  y = mx.array(-2.0)
-
-  # First call compiles the function
-  print(compiled_fun(x, y))
-
-  # Second call with different shapes
-  # does not recompile the function
-  x = mx.array([1.0, -6.0])
-  y = mx.array([-2.0, 3.0])
-  print(compiled_fun(x, y))
-
-
-Use shapeless compilations carefully. Since compilation is not triggered when
-shapes change, any graphs which are conditional on the input shapes will not
-work as expected. Shape-dependent computations are common and sometimes subtle
-to detect. For example:
-
-.. code-block:: python
-
-  def fun(x):
-      return x.reshape(x.shape[0] * x.shape[1], -1)
-
-  compiled_fun = mx.compile(fun, shapeless=True)
-
-  x = mx.random.uniform(shape=(2, 3, 4))
-
-  out = compiled_fun(x)
-
-  x = mx.random.uniform(shape=(5, 5, 3))
-
-  # Error, can't reshape (5, 5, 3) to (6, -1)
-  out = compiled_fun(x)
-
-The second call to the ``compiled_fun`` fails because of the call to
-:func:`reshape` which uses the static shape of ``x`` in the first call. We can
-fix this by using :func:`flatten` to avoid hardcoding the shape of ``x``:
-
-.. code-block:: python
-
-  def fun(x):
-      return x.flatten(0, 1)
-
-  compiled_fun = mx.compile(fun, shapeless=True)
-
-  x = mx.random.uniform(shape=(2, 3, 4))
-
-  out = compiled_fun(x)
-
-  x = mx.random.uniform(shape=(5, 5, 3))
-
-  # Ok
-  out = compiled_fun(x)
--- a/docs/src/usage/distributed.rst
+++ b/docs/src/usage/distributed.rst
@@ -57,7 +57,7 @@ with the Anaconda package manager as follows:

 .. code:: shell

-    $ conda install conda-forge::openmpi
+    $ conda install openmpi

 Installing with Homebrew may require specifying the location of ``libmpi.dylib``
 so that MLX can find it and load it at runtime. This can simply be achieved by
@@ -141,13 +141,12 @@ everything else remaining the same.
    from mlx.utils import tree_map

    def all_reduce_grads(grads):
-        N = mx.distributed.init().size()
+        N = mx.distributed.init().size()
        if N == 1:
            return grads
        return tree_map(
-            lambda x: mx.distributed.all_sum(x) / N,
-            grads
-        )
+                lambda x: mx.distributed.all_sum(x) / N,
+                grads)

    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
--- a/docs/src/usage/export.rst
+++ b/docs/src/usage/export.rst
@@ -1,288 +0,0 @@
-.. _export_usage:
-
-Exporting Functions
-===================
-
-.. currentmodule:: mlx.core
-
-MLX has an API to export and import functions to and from a file. This lets you
-run computations written in one MLX front-end (e.g. Python) in another MLX
-front-end (e.g. C++). 
-
-This guide walks through the basics of the MLX export API with some examples.
-To see the full list of functions check-out the :ref:`API documentation
-<export>`.
-
-Basics of Exporting 
-------------------
-
-Let's start with a simple example:
- 
-.. code-block:: python
-
-  def fun(x, y):
-    return x + y
-
-  x = mx.array(1.0)
-  y = mx.array(1.0)
-  mx.export_function("add.mlxfn", fun, x, y)
-
-To export a function, provide sample input arrays that the function
-can be called with. The data doesn't matter, but the shapes and types of the
-arrays do. In the above example we exported ``fun`` with two ``float32``
-scalar arrays. We can then import the function and run it:
-
-.. code-block:: python
-
-  add_fun = mx.import_function("add.mlxfn")
-
-  out, = add_fun(mx.array(1.0), mx.array(2.0))
-  # Prints: array(3, dtype=float32)
-  print(out)
-
-  out, = add_fun(mx.array(1.0), mx.array(3.0))
-  # Prints: array(4, dtype=float32)
-  print(out)
-
-  # Raises an exception
-  add_fun(mx.array(1), mx.array(3.0))
-
-  # Raises an exception
-  add_fun(mx.array([1.0, 2.0]), mx.array(3.0))
-
-Notice the third and fourth calls to ``add_fun`` raise exceptions because the
-shapes and types of the inputs are different than the shapes and types of the
-example inputs we exported the function with.
-
-Also notice that even though the original ``fun`` returns a single output
-array, the imported function always returns a tuple of one or more arrays.
-
-The inputs to :func:`export_function` and to an imported function can be
-specified as variable positional arguments or as a tuple of arrays:
-
-.. code-block:: python
-
-  def fun(x, y):
-    return x + y
-
-  x = mx.array(1.0)
-  y = mx.array(1.0)
-   
-  # Both arguments to fun are positional
-  mx.export_function("add.mlxfn", fun, x, y)
-
-  # Same as above
-  mx.export_function("add.mlxfn", fun, (x, y))
-
-  imported_fun = mx.import_function("add.mlxfn")
-
-  # Ok
-  out, = imported_fun(x, y)
-
-  # Also ok
-  out, = imported_fun((x, y))
-
-You can pass example inputs to functions as positional or keyword arguments. If
-you use keyword arguments to export the function, then you have to use the same
-keyword arguments when calling the imported function.
-
-.. code-block:: python
-
-  def fun(x, y):
-    return x + y
-
-  # One argument to fun is positional, the other is a kwarg
-  mx.export_function("add.mlxfn", fun, x, y=y)
-
-  imported_fun = mx.import_function("add.mlxfn")
-
-  # Ok
-  out, = imported_fun(x, y=y)
-
-  # Also ok
-  out, = imported_fun((x,), {"y": y})
-
-  # Raises since the keyword argument is missing
-  out, = imported_fun(x, y)
-
-  # Raises since the keyword argument has the wrong key
-  out, = imported_fun(x, z=y)
-
-
-Exporting Modules
-----------------
-
-An :obj:`mlx.nn.Module` can be exported with or without the parameters included
-in the exported function. Here's an example:
-
-.. code-block:: python
-
-   model = nn.Linear(4, 4)
-   mx.eval(model.parameters())
-
-   def call(x):
-      return model(x)
-
-   mx.export_function("model.mlxfn", call, mx.zeros(4))
-
-In the above example, the :obj:`mlx.nn.Linear` module is exported. Its
-parameters are also saved to the ``model.mlxfn`` file.
-
-.. note::
-
-   For enclosed arrays inside an exported function, be extra careful to ensure
-   they are evaluated. The computation graph that gets exported will include
-   the computation that produces enclosed inputs.
-  
-   If the above example was missing ``mx.eval(model.parameters())``, the
-   exported function would include the random initialization of the
-   :obj:`mlx.nn.Module` parameters.
-
-If you only want to export the ``Module.__call__`` function without the
-parameters, pass them as inputs to the ``call`` wrapper:
-
-.. code-block:: python
-
-   model = nn.Linear(4, 4)
-   mx.eval(model.parameters())
-
-   def call(x, **params):
-     # Set the model's parameters to the input parameters
-     model.update(tree_unflatten(list(params.items())))
-     return model(x)
- 
-   params = dict(tree_flatten(model.parameters()))
-   mx.export_function("model.mlxfn", call, (mx.zeros(4),), params)
-
-
-Shapeless Exports
-----------------
-
-Just like :func:`compile`, functions can also be exported for dynamically shaped
-inputs. Pass ``shapeless=True`` to :func:`export_function` or :func:`exporter`
-to export a function which can be used for inputs with variable shapes:
-
-.. code-block:: python
-
-  mx.export_function("fun.mlxfn", mx.abs, mx.array(0.0), shapeless=True)
-  imported_abs = mx.import_function("fun.mlxfn")
-
-  # Ok
-  out, = imported_abs(mx.array(-1.0))
-  
-  # Also ok 
-  out, = imported_abs(mx.array([-1.0, -2.0]))
-
-With ``shapeless=False`` (which is the default), the second call to
-``imported_abs`` would raise an exception with a shape mismatch.
-
-Shapeless exporting works the same as shapeless compilation and should be
-used carefully. See the :ref:`documentation on shapeless compilation
-<shapeless_compile>` for more information.
-
-Exporting Multiple Traces
-------------------------
-
-In some cases, functions build different computation graphs for different
-input arguments. A simple way to manage this is to export to a new file with
-each set of inputs. This is a fine option in many cases. But it can be
-suboptimal if the exported functions have a large amount of duplicate constant
-data (for example the parameters of a :obj:`mlx.nn.Module`).
-
-The export API in MLX lets you export multiple traces of the same function to
-a single file by creating an exporting context manager with :func:`exporter`:
-
-.. code-block:: python
-
-  def fun(x, y=None):
-      constant = mx.array(3.0)
-      if y is not None:
-        x += y 
-      return x + constant
-
-  with mx.exporter("fun.mlxfn", fun) as exporter:
-      exporter(mx.array(1.0))
-      exporter(mx.array(1.0), y=mx.array(0.0))
-
-  imported_function = mx.import_function("fun.mlxfn")
-
-  # Call the function with y=None
-  out, = imported_function(mx.array(1.0))
-  print(out)
-
-  # Call the function with y specified
-  out, = imported_function(mx.array(1.0), y=mx.array(1.0))
-  print(out)
-
-In the above example the function constant data, (i.e. ``constant``), is only
-saved once. 
-
-Transformations with Imported Functions
---------------------------------------
-
-Function transformations like :func:`grad`, :func:`vmap`, and :func:`compile` work
-on imported functions just like regular Python functions:
-
-.. code-block:: python
-
-  def fun(x):
-      return mx.sin(x)
-
-  x = mx.array(0.0)
-  mx.export_function("sine.mlxfn", fun, x)
-
-  imported_fun = mx.import_function("sine.mlxfn")
-
-  # Take the derivative of the imported function
-  dfdx = mx.grad(lambda x: imported_fun(x)[0])
-  # Prints: array(1, dtype=float32)
-  print(dfdx(x))
-
-  # Compile the imported function 
-  compiled_fun = mx.compile(imported_fun)
-  # Prints: array(0, dtype=float32)
-  print(compiled_fun(x)[0])
-
-
-Importing Functions in C++
--------------------------
-
-Importing and running functions in C++ is basically the same as importing and
-running them in Python. First, follow the :ref:`instructions <mlx_in_cpp>` to
-setup a simple C++ project that uses MLX as a library.
-
-Next, export a simple function from Python:
-
-.. code-block:: python
-
-  def fun(x, y):
-      return mx.exp(x + y)
-
-  x = mx.array(1.0)
-  y = mx.array(1.0)
-  mx.export_function("fun.mlxfn", fun, x, y)
-
-
-Import and run the function in C++ with only a few lines of code:
-
-.. code-block:: c++
-
-  auto fun = mx::import_function("fun.mlxfn");
-
-  auto inputs = {mx::array(1.0), mx::array(1.0)};
-  auto outputs = fun(inputs);
-
-  // Prints: array(7.38906, dtype=float32)
-  std::cout << outputs[0] << std::endl;
-
-Imported functions can be transformed in C++ just like in Python. Use 
-``std::vector<mx::array>`` for positional arguments and ``std::map<std::string,
-mx::array>`` for keyword arguments when calling imported functions in C++.
-
-More Examples
-------------
-
-Here are a few more complete examples exporting more complex functions from
-Python and importing and running them in C++:
-
-* `Inference and training a multi-layer perceptron <https://github.com/ml-explore/mlx/tree/main/examples/export>`_
--- a/examples/cmake_project/CMakeLists.txt
+++ b/examples/cmake_project/CMakeLists.txt
@@ -1,22 +0,0 @@
-cmake_minimum_required(VERSION 3.27)
-
-project(example LANGUAGES CXX)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-# Comment out the following two commands if only the MLX C++ library is
-# installed, and set(MLX_ROOT "/path/to/mlx") directly if needed.
-find_package(
-  Python 3.9
-  COMPONENTS Interpreter Development.Module
-  REQUIRED)
-execute_process(
-  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE MLX_ROOT)
-
-find_package(MLX CONFIG REQUIRED)
-
-add_executable(example example.cpp)
-target_link_libraries(example PRIVATE mlx)
--- a/examples/cmake_project/README.md
+++ b/examples/cmake_project/README.md
@@ -1,26 +0,0 @@
-## Build and Run 
-
-Install MLX with Python:
-
-```bash
-pip install "mlx>=0.22"
-```
-
-Build the C++ example:
-
-```bash
-cmake -B build -DCMAKE_BUILD_TYPE=Release
-cmake --build build
-```
-
-Run the C++ example:
-
-```
-./build/example
-```
-
-which should output:
-
-```
-array([2, 4, 6], dtype=int32)
-```
--- a/examples/cmake_project/example.cpp
+++ b/examples/cmake_project/example.cpp
@@ -1,14 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include <iostream>
-
-#include "mlx/mlx.h"
-
-namespace mx = mlx::core;
-
-int main() {
-  auto x = mx::array({1, 2, 3});
-  auto y = mx::array({1, 2, 3});
-  std::cout << x + y << std::endl;
-  return 0;
-}
--- a/examples/export/CMakeLists.txt
+++ b/examples/export/CMakeLists.txt
@@ -1,22 +0,0 @@
-cmake_minimum_required(VERSION 3.27)
-
-project(import_mlx LANGUAGES CXX)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-find_package(
-  Python 3.9
-  COMPONENTS Interpreter Development.Module
-  REQUIRED)
-execute_process(
-  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE MLX_ROOT)
-find_package(MLX CONFIG REQUIRED)
-
-add_executable(eval_mlp eval_mlp.cpp)
-target_link_libraries(eval_mlp PRIVATE mlx)
-
-add_executable(train_mlp train_mlp.cpp)
-target_link_libraries(train_mlp PRIVATE mlx)
--- a/examples/export/README.md
+++ b/examples/export/README.md
@@ -1,49 +0,0 @@
-## Setup
-
-Install MLX:
-
-```bash
-pip install "mlx>=0.22"
-```
-
-Build the C++ examples:
-
-```bash
-cmake -B build -DCMAKE_BUILD_TYPE=Release
-cmake --build build
-```
-
-## Run
-
-### Eval MLP
-
-Run the Python script to export the eval function:
-
-```bash
-python eval_mlp.py
-```
-
-Then run the C++ program to import and run the function:
-
-```
-./build/eval_mlp
-```
-
-The Python and C++ programs should output the same result.
-
-### Train MLP
-
-Run the Python script to export the model initialization and training
-functions:
-
-```bash
-python train_mlp.py
-```
-
-Then run the C++ program to import and run the functions:
-
-```
-./build/train_mlp
-```
-
-The Python and C++ programs should output the same results.
--- a/examples/export/eval_mlp.cpp
+++ b/examples/export/eval_mlp.cpp
@@ -1,25 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include <mlx/mlx.h>
-#include <iostream>
-
-namespace mx = mlx::core;
-
-int main() {
-  int batch_size = 8;
-  int input_dim = 32;
-
-  // Make the input
-  mx::random::seed(42);
-  auto example_x = mx::random::uniform({batch_size, input_dim});
-
-  // Import the function
-  auto forward = mx::import_function("eval_mlp.mlxfn");
-
-  // Call the imported function
-  auto out = forward({example_x})[0];
-
-  std::cout << out << std::endl;
-
-  return 0;
-}
--- a/examples/export/eval_mlp.py
+++ b/examples/export/eval_mlp.py
@@ -1,52 +0,0 @@
-# Copyright © 2024 Apple Inc.
-
-import mlx.core as mx
-import mlx.nn as nn
-import mlx.utils
-
-
-class MLP(nn.Module):
-    """A simple MLP."""
-
-    def __init__(
-        self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
-    ):
-        super().__init__()
-        layer_sizes = [input_dim] + [hidden_dim] * num_layers + [output_dim]
-        self.layers = [
-            nn.Linear(idim, odim)
-            for idim, odim in zip(layer_sizes[:-1], layer_sizes[1:])
-        ]
-
-    def __call__(self, x):
-        for l in self.layers[:-1]:
-            x = nn.relu(l(x))
-        return self.layers[-1](x)
-
-
-if __name__ == "__main__":
-
-    batch_size = 8
-    input_dim = 32
-    output_dim = 10
-
-    # Load the model
-    mx.random.seed(0)  # Seed for params
-    model = MLP(num_layers=5, input_dim=input_dim, hidden_dim=64, output_dim=output_dim)
-    mx.eval(model)
-
-    # Note, the model parameters are saved in the export function
-    def forward(x):
-        return model(x)
-
-    mx.random.seed(42)  # Seed for input
-    example_x = mx.random.uniform(shape=(batch_size, input_dim))
-
-    mx.export_function("eval_mlp.mlxfn", forward, example_x)
-
-    # Import in Python
-    imported_forward = mx.import_function("eval_mlp.mlxfn")
-    expected = forward(example_x)
-    (out,) = imported_forward(example_x)
-    assert mx.allclose(expected, out)
-    print(out)
--- a/examples/export/train_mlp.cpp
+++ b/examples/export/train_mlp.cpp
@@ -1,35 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include <mlx/mlx.h>
-#include <iostream>
-
-namespace mx = mlx::core;
-
-int main() {
-  int batch_size = 8;
-  int input_dim = 32;
-  int output_dim = 10;
-
-  auto state = mx::import_function("init_mlp.mlxfn")({});
-
-  // Make the input
-  mx::random::seed(42);
-  auto example_X = mx::random::normal({batch_size, input_dim});
-  auto example_y = mx::random::randint(0, output_dim, {batch_size});
-
-  // Import the function
-  auto step = mx::import_function("train_mlp.mlxfn");
-
-  // Call the imported function
-  for (int it = 0; it < 100; ++it) {
-    state.insert(state.end(), {example_X, example_y});
-    state = step(state);
-    eval(state);
-    auto loss = state.back();
-    state.pop_back();
-    if (it % 10 == 0) {
-      std::cout << "Loss " << loss.item<float>() << std::endl;
-    }
-  }
-  return 0;
-}
--- a/examples/export/train_mlp.py
+++ b/examples/export/train_mlp.py
@@ -1,76 +0,0 @@
-# Copyright © 2024 Apple Inc.
-
-import mlx.core as mx
-import mlx.nn as nn
-import mlx.optimizers as optim
-import mlx.utils
-
-
-class MLP(nn.Module):
-    """A simple MLP."""
-
-    def __init__(
-        self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
-    ):
-        super().__init__()
-        layer_sizes = [input_dim] + [hidden_dim] * num_layers + [output_dim]
-        self.layers = [
-            nn.Linear(idim, odim)
-            for idim, odim in zip(layer_sizes[:-1], layer_sizes[1:])
-        ]
-
-    def __call__(self, x):
-        for l in self.layers[:-1]:
-            x = nn.relu(l(x))
-        return self.layers[-1](x)
-
-
-if __name__ == "__main__":
-
-    batch_size = 8
-    input_dim = 32
-    output_dim = 10
-
-    def init():
-        # Seed for the parameter initialization
-        mx.random.seed(0)
-        model = MLP(
-            num_layers=3, input_dim=input_dim, hidden_dim=64, output_dim=output_dim
-        )
-        optimizer = optim.SGD(learning_rate=1e-1)
-        optimizer.init(model.parameters())
-        state = [model.parameters(), optimizer.state]
-        tree_structure, state = zip(*mlx.utils.tree_flatten(state))
-        return model, optimizer, tree_structure, state
-
-    # Export the model parameter initialization
-    model, optimizer, tree_structure, state = init()
-    mx.eval(state)
-    mx.export_function("init_mlp.mlxfn", lambda: init()[-1])
-
-    def loss_fn(params, X, y):
-        model.update(params)
-        return nn.losses.cross_entropy(model(X), y, reduction="mean")
-
-    def step(*inputs):
-        *state, X, y = inputs
-        params, opt_state = mlx.utils.tree_unflatten(list(zip(tree_structure, state)))
-        optimizer.state = opt_state
-        loss, grads = mx.value_and_grad(loss_fn)(params, X, y)
-        params = optimizer.apply_gradients(grads, params)
-        _, state = zip(*mlx.utils.tree_flatten([params, optimizer.state]))
-        return *state, loss
-
-    # Make some random data
-    mx.random.seed(42)
-    example_X = mx.random.normal(shape=(batch_size, input_dim))
-    example_y = mx.random.randint(low=0, high=output_dim, shape=(batch_size,))
-    mx.export_function("train_mlp.mlxfn", step, *state, example_X, example_y)
-
-    # Export one step of SGD
-    imported_step = mx.import_function("train_mlp.mlxfn")
-
-    for it in range(100):
-        *state, loss = imported_step(*state, example_X, example_y)
-        if it % 10 == 0:
-            print(f"Loss {loss.item():.6}")
--- a/examples/extensions/CMakeLists.txt
+++ b/examples/extensions/CMakeLists.txt
@@ -18,7 +18,8 @@ find_package(
 execute_process(
  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE nanobind_ROOT)
+  OUTPUT_VARIABLE NB_DIR)
+list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
 find_package(nanobind CONFIG REQUIRED)

 # ----------------------------- Extensions -----------------------------
--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@@ -6,7 +6,6 @@

 #include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/copy.h"
 #include "mlx/utils.h"

 #include "axpby/axpby.h"
@@ -59,9 +58,9 @@ mx::array axpby(
  // Construct the array as the output of the Axpby primitive
  // with the broadcasted and upcasted arrays as inputs
  return mx::array(
-      /* const mx::Shape& shape = */ out_shape,
+      /* const std::vector<int>& shape = */ out_shape,
      /* mx::Dtype dtype = */ out_dtype,
-      /* std::shared_ptr<mx::Primitive> primitive = */
+      /* std::shared_ptr<mx::Primitive> primitive = */
      std::make_shared<Axpby>(to_stream(s), alpha, beta),
      /* const std::vector<mx::array>& inputs = */ broadcasted_inputs);
 }
@@ -280,7 +279,7 @@ void Axpby::eval_gpu(
  if (!contiguous_kernel) {
    compute_encoder.set_vector_bytes(x.shape(), 5);
    compute_encoder.set_vector_bytes(x.strides(), 6);
-    compute_encoder.set_vector_bytes(y.strides(), 7);
+    compute_encoder.set_vector_bytes(y.strides(), 7); // vector, like x.strides()
    compute_encoder.set_bytes(ndim, 8);
  }

--- a/examples/extensions/pyproject.toml
+++ b/examples/extensions/pyproject.toml
@@ -1,8 +1,8 @@
 [build-system]
 requires = [
  "setuptools>=42",
-  "cmake>=3.25",
+  "cmake>=3.24",
  "mlx>=0.18.0",
-  "nanobind==2.4.0",
+  "nanobind==2.2.0",
 ]
 build-backend = "setuptools.build_meta"
--- a/examples/extensions/requirements.txt
+++ b/examples/extensions/requirements.txt
@@ -1,4 +1,4 @@
 setuptools>=42
-cmake>=3.25
+cmake>=3.24
 mlx>=0.21.0
 nanobind==2.2.0
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -5,7 +5,6 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/export.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
@@ -29,16 +28,21 @@ if(WIN32)
  set_target_properties(mlx PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
 endif()

-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
-
 if(MLX_BUILD_CPU)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cpu)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
 else()
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_cpu)
 endif()

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/distributed)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
+if(MLX_BUILD_ACCELERATE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
+elseif(MLX_BUILD_CPU)
+  target_sources(
+    mlx
+    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/common/default_primitives.cpp)
+endif()

 if(MLX_BUILD_METAL)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/metal)
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -10,8 +10,22 @@

 namespace mlx::core {

+namespace {
+
+/** Return true if we are currently performing a function transformation in
+ * order to keep the graph when evaluating tracer arrays. */
+bool in_tracing() {
+  return detail::InTracing::in_tracing();
+}
+
+bool retain_graph() {
+  return detail::RetainGraph::retain_graph();
+}
+
+} // namespace
+
 array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
-    : array_desc_(std::make_shared<ArrayDesc>(Shape{}, dtype)) {
+    : array_desc_(std::make_shared<ArrayDesc>(std::vector<int>{}, dtype)) {
  auto cval = static_cast<complex64_t>(val);
  init(&cval);
 }
@@ -47,14 +61,14 @@ std::vector<array> array::make_arrays(

 array::array(std::initializer_list<float> data)
    : array_desc_(std::make_shared<ArrayDesc>(
-          Shape{static_cast<ShapeElem>(data.size())},
+          std::vector<int>{static_cast<int>(data.size())},
          float32)) {
  init(data.begin());
 }

 array::array(std::initializer_list<int> data, Dtype dtype)
    : array_desc_(std::make_shared<ArrayDesc>(
-          Shape{static_cast<ShapeElem>(data.size())},
+          std::vector<int>{static_cast<int>(data.size())},
          dtype)) {
  init(data.begin());
 }
@@ -105,8 +119,7 @@ void array::eval() {
 }

 bool array::is_tracer() const {
-  return (array_desc_->is_tracer && detail::in_tracing()) ||
-      detail::retain_graph();
+  return (array_desc_->is_tracer && in_tracing()) || retain_graph();
 }

 void array::set_data(allocator::Buffer buffer, Deleter d) {
@@ -264,19 +277,7 @@ array::ArrayDesc::~ArrayDesc() {
    }
    ad.inputs.clear();
    for (auto& [_, a] : input_map) {
-      bool is_deletable =
-          (a.array_desc_.use_count() <= a.siblings().size() + 1);
-      // An array with siblings is deletable only if all of its siblings
-      // are deletable
-      for (auto& s : a.siblings()) {
-        if (!is_deletable) {
-          break;
-        }
-        int is_input = (input_map.find(s.id()) != input_map.end());
-        is_deletable &=
-            s.array_desc_.use_count() <= a.siblings().size() + is_input;
-      }
-      if (is_deletable) {
+      if (a.array_desc_.use_count() <= a.siblings().size() + 1) {
        for_deletion.push_back(std::move(a.array_desc_));
      }
    }
@@ -309,7 +310,7 @@ array::ArrayIterator::ArrayIterator(const array& arr, int idx)
 }

 array::ArrayIterator::reference array::ArrayIterator::operator*() const {
-  auto start = Shape(arr.ndim(), 0);
+  auto start = std::vector<int>(arr.ndim(), 0);
  auto end = arr.shape();
  auto shape = arr.shape();
  shape.erase(shape.begin());
--- a/mlx/array.h
+++ b/mlx/array.h
@@ -17,8 +17,7 @@ namespace mlx::core {
 class Primitive;

 using Deleter = std::function<void(allocator::Buffer)>;
-using ShapeElem = int32_t;
-using Shape = std::vector<ShapeElem>;
+using Shape = std::vector<int32_t>;
 using Strides = std::vector<int64_t>;

 class array {
@@ -35,29 +34,29 @@ class array {
  explicit array(const std::complex<float>& val, Dtype dtype = complex64);

  template <typename It>
-  explicit array(
+  array(
      It data,
      Shape shape,
      Dtype dtype =
          TypeToDtype<typename std::iterator_traits<It>::value_type>());

  template <typename T>
-  explicit array(std::initializer_list<T> data, Dtype dtype = TypeToDtype<T>());
+  array(std::initializer_list<T> data, Dtype dtype = TypeToDtype<T>());

  /* Special case so empty lists default to float32. */
-  explicit array(std::initializer_list<float> data);
+  array(std::initializer_list<float> data);

  /* Special case so array({}, type) is an empty array. */
-  explicit array(std::initializer_list<int> data, Dtype dtype);
+  array(std::initializer_list<int> data, Dtype dtype);

  template <typename T>
-  explicit array(
+  array(
      std::initializer_list<T> data,
      Shape shape,
      Dtype dtype = TypeToDtype<T>());

  /* Build an array from a buffer */
-  explicit array(
+  array(
      allocator::Buffer data,
      Shape shape,
      Dtype dtype,
@@ -499,7 +498,7 @@ class array {

 template <typename T>
 array::array(T val, Dtype dtype /* = TypeToDtype<T>() */)
-    : array_desc_(std::make_shared<ArrayDesc>(Shape{}, dtype)) {
+    : array_desc_(std::make_shared<ArrayDesc>(std::vector<int>{}, dtype)) {
  init(&val);
 }

@@ -517,7 +516,7 @@ array::array(
    std::initializer_list<T> data,
    Dtype dtype /* = TypeToDtype<T>() */)
    : array_desc_(std::make_shared<ArrayDesc>(
-          Shape{static_cast<ShapeElem>(data.size())},
+          std::vector<int>{static_cast<int>(data.size())},
          dtype)) {
  init(data.begin());
 }
--- a/mlx/backend/accelerate/CMakeLists.txt
+++ b/mlx/backend/accelerate/CMakeLists.txt
@@ -0,0 +1,8 @@
+target_sources(
+  mlx
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp)
--- a/mlx/backend/accelerate/conv.cpp
+++ b/mlx/backend/accelerate/conv.cpp
@@ -0,0 +1,20 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <cassert>
+
+#include <Accelerate/Accelerate.h>
+#include <simd/vector.h>
+
+#include "mlx/backend/common/copy.h"
+#include "mlx/primitives.h"
+#include "mlx/utils.h"
+
+namespace mlx::core {
+
+void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
+  eval(inputs, out);
+
+  // TODO: Add accelerate based optimizations for CPU conv
+}
+
+} // namespace mlx::core
--- a/mlx/backend/accelerate/matmul.cpp
+++ b/mlx/backend/accelerate/matmul.cpp
@@ -0,0 +1,253 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <cassert>
+
+#include <Accelerate/Accelerate.h>
+
+#include "mlx/backend/accelerate/utils.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/primitives.h"
+#include "mlx/utils.h"
+
+namespace mlx::core {
+
+namespace {
+
+std::tuple<bool, size_t, array> check_transpose(const array& arr) {
+  auto stx = arr.strides()[arr.ndim() - 2]; // stride of the second-to-last axis
+  auto sty = arr.strides()[arr.ndim() - 1]; // stride of the last axis
+  if (stx == arr.shape(-1) && sty == 1) {
+    return std::make_tuple(false, stx, arr); // usable as-is, not transposed
+  } else if (stx == 1 && sty == arr.shape(-2)) {
+    return std::make_tuple(true, sty, arr); // usable as-is, flagged transposed
+  } else {
+    array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+    copy(arr, arr_copy, CopyType::General); // neither layout matched: copy
+    size_t ld = arr.shape(-1); // distinct name: no longer shadows outer stx
+    return std::make_tuple(false, ld, arr_copy);
+  }
+}
+
+inline void matmul_cblas_general(
+    const array& a_pre,
+    const array& b_pre,
+    array& out,
+    float alpha = 1.0f,
+    float beta = 0.0f) {
+  if (out.dtype() != float32) {
+    throw std::runtime_error(
+        "[matmul_cblas] on CPU currently only supports float32");
+  }
+
+  auto [a_transposed, lda, a] = check_transpose(a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(b_pre);
+  size_t M = a.shape(-2);
+  size_t N = b.shape(-1);
+  size_t K = a.shape(-1);
+
+  if (M == 0 || N == 0) {
+    return;
+  }
+  if (K == 0) {
+    std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
+    return;
+  }
+
+  for (int i = 0; i < (a.size() / (M * K)); ++i) {
+    cblas_sgemm(
+        CblasRowMajor,
+        a_transposed ? CblasTrans : CblasNoTrans, // transA
+        b_transposed ? CblasTrans : CblasNoTrans, // transB
+        M,
+        N,
+        K,
+        alpha, // alpha
+        a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
+        lda,
+        b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
+        ldb,
+        beta, // beta
+        out.data<float>() + M * N * i,
+        out.shape(-1) // ldc
+    );
+  }
+}
+
+inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
+  if (out.dtype() != float32) {
+    throw std::runtime_error(
+        "[matmul_cblas] on CPU currently only supports float32");
+  }
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  return matmul_cblas_general(a_pre, b_pre, out);
+}
+
+inline void matmul_bnns_general(
+    const array& a_pre,
+    const array& b_pre,
+    array& out,
+    float alpha = 1.0f,
+    float beta = 0.0f) {
+  // TODO: Update to utilize BNNS broadcasting
+
+  auto [a_transposed, lda, a] = check_transpose(a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(b_pre);
+  size_t M = a.shape(-2);
+  size_t N = b.shape(-1);
+  size_t K = a.shape(-1);
+
+  if (M == 0 || N == 0) {
+    return;
+  }
+  if (K == 0) {
+    std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
+    return;
+  }
+
+  BNNSDataType bnns_dtype = to_bnns_dtype(out.dtype());
+
+  const BNNSLayerParametersBroadcastMatMul gemm_params{
+      /* float alpha = */ alpha,
+      /* float beta = */ beta,
+      /* bool transA = */ a_transposed,
+      /* bool transB = */ b_transposed,
+      /* bool quadratic = */ false,
+      /* bool a_is_weights = */ false,
+      /* bool b_is_weights = */ false,
+      /* BNNSNDArrayDescriptor iA_desc = */
+      BNNSNDArrayDescriptor{
+          /* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
+          /* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,
+
+          /* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
+          {lda, (M * K) / lda, 0, 0, 0, 0, 0, 0},
+          /* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
+          {1, lda, 0, 0, 0, 0, 0, 0},
+
+          /* void * _Nullable data = */ nullptr,
+          /* BNNSDataType data_type = */ bnns_dtype,
+
+          /* void * _Nullable table_data = */ nullptr,
+          /* BNNSDataType table_data_type = */ bnns_dtype,
+
+          /* float data_scale = */ 1.0,
+          /* float data_bias = */ 0.0,
+      },
+      /* BNNSNDArrayDescriptor iB_desc = */
+      BNNSNDArrayDescriptor{
+          /* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
+          /* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,
+
+          /* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
+          {ldb, (K * N) / ldb, 0, 0, 0, 0, 0, 0},
+          /* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
+          {1, ldb, 0, 0, 0, 0, 0, 0},
+
+          /* void * _Nullable data = */ nullptr,
+          /* BNNSDataType data_type = */ bnns_dtype,
+
+          /* void * _Nullable table_data = */ nullptr,
+          /* BNNSDataType table_data_type = */ bnns_dtype,
+
+          /* float data_scale = */ 1.0,
+          /* float data_bias = */ 0.0,
+      },
+      /* BNNSNDArrayDescriptor o_desc = */
+      BNNSNDArrayDescriptor{
+          /* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
+          /* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,
+
+          /* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
+          {N, M, 0, 0, 0, 0, 0, 0},
+          /* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
+          {1, N, 0, 0, 0, 0, 0, 0},
+
+          /* void * _Nullable data = */ nullptr,
+          /* BNNSDataType data_type = */ bnns_dtype,
+
+          /* void * _Nullable table_data = */ nullptr,
+          /* BNNSDataType table_data_type = */ bnns_dtype,
+
+          /* float data_scale = */ 1.0,
+          /* float data_bias = */ 0.0,
+      },
+  };
+
+  auto bnns_filter =
+      BNNSFilterCreateLayerBroadcastMatMul(&gemm_params, nullptr);
+
+  for (int i = 0; i < (a.size() / (M * K)); ++i) {
+    BNNSFilterApplyTwoInput(
+        bnns_filter,
+        a.data<uint8_t>() +
+            elem_to_loc(M * K * i, a.shape(), a.strides()) * a.itemsize(),
+        b.data<uint8_t>() +
+            elem_to_loc(K * N * i, b.shape(), b.strides()) * b.itemsize(),
+        out.data<uint8_t>() + M * N * i * out.itemsize());
+  }
+
+  BNNSFilterDestroy(bnns_filter);
+}
+
+inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
+  // TODO: Update to utilize BNNS broadcasting
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  return matmul_bnns_general(a_pre, b_pre, out);
+}
+
+template <typename T>
+inline void mask_matrix(
+    T* data,
+    const bool* mask,
+    int tile_size,
+    const int X,
+    const int Y,
+    const size_t X_data_str,
+    const size_t Y_data_str,
+    const size_t X_mask_str,
+    const size_t Y_mask_str) {
+  int tX = (X + tile_size - 1) / tile_size;
+  int tY = (Y + tile_size - 1) / tile_size;
+
+  for (int i = 0; i < tX; i++) {
+    for (int j = 0; j < tY; j++) {
+      bool do_mask = mask[i * X_mask_str + j * Y_mask_str];
+      if (!do_mask) {
+        int loc_x = i * tile_size;
+        int loc_y = j * tile_size;
+        T* data_block = data + loc_x * X_data_str + loc_y * Y_data_str;
+
+        int size_x = std::min(tile_size, X - loc_x);
+        int size_y = std::min(tile_size, Y - loc_y);
+        for (int ii = 0; ii < size_x; ii++) {
+          for (int jj = 0; jj < size_y; jj++) {
+            data_block[ii * X_data_str + jj * Y_data_str] = T(0.);
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace
+
+void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
+  if (out.dtype() == float32) {
+    return matmul_cblas(inputs[0], inputs[1], out);
+  }
+  return matmul_bnns(inputs[0], inputs[1], out);
+}
+
+void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
+  // Fill output with C
+  auto& c = inputs[2];
+  CopyType ctype = c.data_size() == 1 ? CopyType::Scalar : CopyType::General;
+  copy(c, out, ctype);
+
+  if (out.dtype() == float32) {
+    return matmul_cblas_general(inputs[0], inputs[1], out, alpha_, beta_);
+  }
+  return matmul_bnns_general(inputs[0], inputs[1], out, alpha_, beta_);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/accelerate/primitives.cpp
+++ b/mlx/backend/accelerate/primitives.cpp
@@ -0,0 +1,602 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <cassert>
+#include <cmath>
+
+#include <Accelerate/Accelerate.h>
+
+#include "mlx/allocator.h"
+#include "mlx/backend/common/binary.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/unary.h"
+#include "mlx/primitives.h"
+
+#define DEFAULT(primitive)                                                 \
+  void primitive::eval_cpu(const std::vector<array>& inputs, array& out) { \
+    primitive::eval(inputs, out);                                          \
+  }
+
+#define DEFAULT_MULTI(primitive)                                       \
+  void primitive::eval_cpu(                                            \
+      const std::vector<array>& inputs, std::vector<array>& outputs) { \
+    primitive::eval(inputs, outputs);                                  \
+  }
+
+namespace mlx::core {
+
+// Use the default implementation for the following primitives
+DEFAULT(Arange)
+DEFAULT(ArgPartition)
+DEFAULT(ArgReduce)
+DEFAULT(ArgSort)
+DEFAULT(AsStrided)
+DEFAULT(BlockMaskedMM)
+DEFAULT(Broadcast)
+DEFAULT(Ceil)
+DEFAULT(Concatenate)
+DEFAULT(Conjugate)
+DEFAULT(Copy)
+DEFAULT_MULTI(CustomTransforms)
+DEFAULT_MULTI(Depends)
+DEFAULT_MULTI(DivMod)
+DEFAULT(NumberOfElements)
+DEFAULT(Equal)
+DEFAULT(Erf)
+DEFAULT(ErfInv)
+DEFAULT(ExpandDims)
+DEFAULT(FFT)
+DEFAULT(Floor)
+DEFAULT(Gather)
+DEFAULT(GatherMM)
+DEFAULT(GatherQMM)
+DEFAULT(Greater)
+DEFAULT(GreaterEqual)
+DEFAULT(Hadamard)
+DEFAULT(Less)
+DEFAULT(LessEqual)
+DEFAULT(Load)
+DEFAULT(LogicalNot)
+DEFAULT(LogicalAnd)
+DEFAULT(LogicalOr)
+DEFAULT(LogAddExp)
+DEFAULT(Maximum)
+DEFAULT(Minimum)
+DEFAULT(NotEqual)
+DEFAULT(Pad)
+DEFAULT(Partition)
+DEFAULT_MULTI(QRF)
+DEFAULT(RandomBits)
+DEFAULT(Remainder)
+DEFAULT(Round)
+DEFAULT(Scatter)
+DEFAULT(Select)
+DEFAULT(Sigmoid)
+DEFAULT(Sign)
+DEFAULT(Slice)
+DEFAULT(SliceUpdate)
+DEFAULT_MULTI(Split)
+DEFAULT(Sort)
+DEFAULT(Squeeze)
+DEFAULT(StopGradient)
+DEFAULT_MULTI(SVD)
+DEFAULT(Transpose)
+DEFAULT(Inverse)
+DEFAULT(Cholesky)
+DEFAULT_MULTI(Eigh)
+
+void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  if (in.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    vDSP_vabs(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
+  } else if (in.dtype() == int32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    vDSP_vabsi(in.data<int>(), 1, out.data<int>(), 1, in.data_size());
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 2);
+  auto& a = inputs[0];
+  auto& b = inputs[1];
+
+  if (a.dtype() == float32) {
+    binary_op<float>(
+        a,
+        b,
+        out,
+        [](auto x, auto y) { return x + y; },
+        [](const auto* s, const auto* vec, auto* o, auto n) {
+          vDSP_vsadd((const float*)vec, 1, (const float*)s, (float*)o, 1, n);
+        },
+        [](const auto* vec, const auto* s, auto* o, auto n) {
+          vDSP_vsadd((const float*)vec, 1, (const float*)s, (float*)o, 1, n);
+        },
+        [](const auto* a, const auto* b, auto* o, auto n) {
+          vDSP_vadd((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n);
+        });
+  } else if (a.dtype() == int32) {
+    binary_op<int>(
+        a,
+        b,
+        out,
+        [](auto x, auto y) { return x + y; },
+        [](const auto* s, const auto* vec, auto* o, auto n) {
+          vDSP_vsaddi((const int*)vec, 1, (const int*)s, (int*)o, 1, n);
+        },
+        [](const auto* vec, const auto* s, auto* o, auto n) {
+          vDSP_vsaddi((const int*)vec, 1, (const int*)s, (int*)o, 1, n);
+        },
+        [](const auto* a, const auto* b, auto* o, auto n) {
+          vDSP_vaddi((const int*)a, 1, (const int*)b, 1, (int*)o, 1, n);
+        });
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void ArcCos::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    int size = in.data_size();
+    vvacosf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void ArcCosh::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    int size = in.data_size();
+    vvacoshf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void ArcSin::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    int size = in.data_size();
+    vvasinf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void ArcSinh::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    int size = in.data_size();
+    vvasinhf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void ArcTan::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    int size = in.data_size();
+    vvatanf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 2);
+  auto& a = inputs[0];
+  auto& b = inputs[1];
+  if (out.dtype() == float32 && a.flags().row_contiguous &&
+      b.flags().row_contiguous) {
+    if (a.is_donatable()) {
+      out.copy_shared_buffer(a);
+    } else if (b.is_donatable()) {
+      out.copy_shared_buffer(b);
+    } else {
+      out.set_data(allocator::malloc_or_wait(out.nbytes()));
+    }
+    int size = a.data_size();
+    vvatan2f(out.data<float>(), a.data<float>(), b.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void ArcTanh::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    int size = in.data_size();
+    vvatanhf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void AsType::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+
+  if (in.flags().contiguous) {
+    // Use accelerate functions if possible
+    if (in.dtype() == float32 && out.dtype() == uint32) {
+      set_unary_output_data(in, out);
+      vDSP_vfixu32(
+          in.data<float>(), 1, out.data<uint32_t>(), 1, in.data_size());
+      return;
+    } else if (in.dtype() == float32 && out.dtype() == int32) {
+      set_unary_output_data(in, out);
+      vDSP_vfix32(in.data<float>(), 1, out.data<int32_t>(), 1, in.data_size());
+      return;
+    } else if (in.dtype() == uint32 && out.dtype() == float32) {
+      set_unary_output_data(in, out);
+      vDSP_vfltu32(
+          in.data<uint32_t>(), 1, out.data<float>(), 1, in.data_size());
+      return;
+    } else if (in.dtype() == int32 && out.dtype() == float32) {
+      set_unary_output_data(in, out);
+      vDSP_vflt32(in.data<int32_t>(), 1, out.data<float>(), 1, in.data_size());
+      return;
+    }
+  }
+  eval(inputs, out);
+}
+
+void Cos::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    int size = in.data_size();
+    vvcosf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void Cosh::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    int size = in.data_size();
+    vvcoshf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 2);
+  auto& a = inputs[0];
+  auto& b = inputs[1];
+
+  if (a.dtype() == int32) {
+    binary_op<int>(
+        a,
+        b,
+        out,
+        [](auto x, auto y) { return x / y; },
+        UseDefaultBinaryOp(),
+        [](const auto* vec, const auto* s, auto* o, auto n) {
+          vDSP_vsdivi((const int*)vec, 1, (const int*)s, (int*)o, 1, n);
+        },
+        [](const auto* a, const auto* b, auto* o, auto n) {
+          vDSP_vdivi((const int*)b, 1, (const int*)a, 1, (int*)o, 1, n);
+        });
+  } else if (a.dtype() == float32) {
+    binary_op<float>(
+        a,
+        b,
+        out,
+        [](auto x, auto y) { return x / y; },
+        [](const auto* s, const auto* vec, auto* o, auto n) {
+          vDSP_svdiv((const float*)s, (const float*)vec, 1, (float*)o, 1, n);
+        },
+        [](const auto* vec, const auto* s, auto* o, auto n) {
+          vDSP_vsdiv((const float*)vec, 1, (const float*)s, (float*)o, 1, n);
+        },
+        [](const auto* a, const auto* b, auto* o, auto n) {
+          vDSP_vdiv((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
+        });
+  } else {
+    eval(inputs, out);
+  }
+}
+
+void Exp::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    int size = in.data_size(); // int local: count is passed by int pointer
+    vvexpf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+// Element-wise exp(x) - 1 on the CPU: contiguous float32 data is handed to
+// Accelerate's vvexpm1f, anything else goes through the generic eval().
+void Expm1::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    // vForce reads the element count through a `const int*`; use a real int
+    // instead of reinterpreting the address of another integer type (strict
+    // aliasing violation, endian-dependent).
+    int size = in.data_size();
+    vvexpm1f(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+// Fill `out` with a single value. A one-element float32 input is broadcast
+// with vDSP_vfill; every other case goes through the generic eval().
+void Full::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& value = inputs[0];
+  assert(value.dtype() == out.dtype());
+  const bool scalar_float_fill =
+      value.data_size() == 1 && out.dtype() == float32;
+  if (!scalar_float_fill) {
+    eval(inputs, out);
+    return;
+  }
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  vDSP_vfill(value.data<float>(), out.data<float>(), 1, out.size());
+}
+
+// Element-wise logarithm on the CPU. Contiguous float32 data is handed to the
+// vForce routine matching `base_` (natural, base 2 or base 10); anything else
+// goes through the generic eval().
+void Log::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    // vForce reads the element count through a `const int*`; use a real int
+    // instead of reinterpreting the address of another integer type (strict
+    // aliasing violation, endian-dependent).
+    int size = in.data_size();
+    switch (base_) {
+      case Base::e:
+        vvlogf(out.data<float>(), in.data<float>(), &size);
+        break;
+      case Base::two:
+        vvlog2f(out.data<float>(), in.data<float>(), &size);
+        break;
+      case Base::ten:
+        vvlog10f(out.data<float>(), in.data<float>(), &size);
+        break;
+    }
+  } else {
+    eval(inputs, out);
+  }
+}
+
+// Element-wise log(1 + x) on the CPU: contiguous float32 data is handed to
+// Accelerate's vvlog1pf, anything else goes through the generic eval().
+void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (out.dtype() == float32 && in.flags().contiguous) {
+    set_unary_output_data(in, out);
+    // vForce reads the element count through a `const int*`; use a real int
+    // instead of reinterpreting the address of another integer type (strict
+    // aliasing violation, endian-dependent).
+    int size = in.data_size();
+    vvlog1pf(out.data<float>(), in.data<float>(), &size);
+  } else {
+    eval(inputs, out);
+  }
+}
+
+// Element-wise multiplication on the CPU. float32 operands are routed to vDSP
+// kernels through binary_op; every other dtype goes to the generic eval().
+void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 2);
+  auto& lhs = inputs[0];
+  auto& rhs = inputs[1];
+
+  if (!(lhs.dtype() == float32)) {
+    eval(inputs, out);
+    return;
+  }
+
+  binary_op<float>(
+      lhs,
+      rhs,
+      out,
+      [](auto p, auto q) { return p * q; },
+      // Scalar * vector and vector * scalar share the same kernel call.
+      [](const auto* scalar, const auto* xs, auto* dst, auto len) {
+        vDSP_vsmul(
+            (const float*)xs, 1, (const float*)scalar, (float*)dst, 1, len);
+      },
+      [](const auto* xs, const auto* scalar, auto* dst, auto len) {
+        vDSP_vsmul(
+            (const float*)xs, 1, (const float*)scalar, (float*)dst, 1, len);
+      },
+      [](const auto* p, const auto* q, auto* dst, auto len) {
+        vDSP_vmul((const float*)p, 1, (const float*)q, 1, (float*)dst, 1, len);
+      });
+}
+
+// Element-wise negation on the CPU: contiguous float32 input is handed to
+// vDSP_vneg, anything else goes through the generic eval().
+void Negative::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& src = inputs[0];
+  const bool use_accelerate = src.dtype() == float32 && src.flags().contiguous;
+  if (!use_accelerate) {
+    eval(inputs, out);
+    return;
+  }
+  set_unary_output_data(src, out);
+  vDSP_vneg(src.data<float>(), 1, out.data<float>(), 1, src.data_size());
+}
+
+// Element-wise power on the CPU. When the output is float32 and both inputs
+// are row contiguous the work is done by vvpowf; otherwise the generic eval()
+// runs.
+void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 2);
+  auto& base = inputs[0];
+  auto& exponent = inputs[1];
+  const bool use_accelerate = out.dtype() == float32 &&
+      base.flags().row_contiguous && exponent.flags().row_contiguous;
+  if (!use_accelerate) {
+    eval(inputs, out);
+    return;
+  }
+
+  int count = base.size();
+  // Reuse an input buffer for the output when one can be donated and has the
+  // same item size; allocate a fresh buffer otherwise.
+  if (base.is_donatable() && base.itemsize() == out.itemsize()) {
+    out.copy_shared_buffer(base);
+  } else if (exponent.is_donatable() && exponent.itemsize() == out.itemsize()) {
+    out.copy_shared_buffer(exponent);
+  } else {
+    out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  }
+  // vvpowf takes the exponents before the bases.
+  vvpowf(out.data<float>(), exponent.data<float>(), base.data<float>(), &count);
+}
+
+void Scan::eval_cpu(const std::vector<array>& inputs, array& out) { // CPU scan: vDSP_vrsum for one special case, generic eval() otherwise
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (reduce_type_ == Scan::Sum && out.dtype() == float32 && // fast path: float32 sum ...
+      in.flags().row_contiguous && in.strides()[axis_] == 1 && !inclusive_) { // ... along a unit-stride axis, non-inclusive only
+    out.set_data(allocator::malloc_or_wait(out.nbytes()));
+    int stride = in.shape(axis_); // length of one scanned run
+    int count = in.size() / stride; // number of runs
+    const float* input = in.data<float>();
+    float* output = out.data<float>();
+    float s = 1.0; // scale factor handed to vDSP_vrsum
+    if (!reverse_) {
+      for (int i = 0; i < count; i++) {
+        vDSP_vrsum(input - 1, 1, &s, output, 1, stride); // source starts one element early; presumably vDSP_vrsum skips its first input, giving the exclusive sum (confirm in vDSP docs)
+        input += stride;
+        output += stride;
+      }
+    } else {
+      for (int i = 0; i < count; i++) {
+        input += stride - 1; // move to the last element of the run
+        output += stride - 1;
+        vDSP_vrsum(input + 1, -1, &s, output, -1, stride); // same call mirrored: walk the run backwards with stride -1
+        input++; // now at the first element of the next run
+        output++;
+      }
+    }
+  } else {
+    eval(inputs, out);
+  }
+}
+
+// Element-wise sine on the CPU: contiguous float32 data is handed to
+// Accelerate's vvsinf, anything else goes through the generic eval().
+void Sin::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& src = inputs[0];
+  const bool use_accelerate = out.dtype() == float32 && src.flags().contiguous;
+  if (!use_accelerate) {
+    eval(inputs, out);
+    return;
+  }
+  set_unary_output_data(src, out);
+  // vForce takes the element count by pointer.
+  int count = src.data_size();
+  vvsinf(out.data<float>(), src.data<float>(), &count);
+}
+
+// Hyperbolic sine on the CPU: contiguous float32 data is handed to
+// Accelerate's vvsinhf, anything else goes through the generic eval().
+void Sinh::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& src = inputs[0];
+  const bool use_accelerate = out.dtype() == float32 && src.flags().contiguous;
+  if (!use_accelerate) {
+    eval(inputs, out);
+    return;
+  }
+  set_unary_output_data(src, out);
+  // vForce takes the element count by pointer.
+  int count = src.data_size();
+  vvsinhf(out.data<float>(), src.data<float>(), &count);
+}
+
+// Element-wise square on the CPU: contiguous float32 input is handed to
+// vDSP_vsq, anything else goes through the generic eval().
+void Square::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& src = inputs[0];
+  const bool use_accelerate = src.dtype() == float32 && src.flags().contiguous;
+  if (!use_accelerate) {
+    eval(inputs, out);
+    return;
+  }
+  set_unary_output_data(src, out);
+  vDSP_vsq(src.data<float>(), 1, out.data<float>(), 1, src.data_size());
+}
+
+// Square root (or reciprocal square root when `recip_` is set) on the CPU.
+// Contiguous float32 input is handed to vForce, anything else goes through
+// the generic eval().
+void Sqrt::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& src = inputs[0];
+  const bool use_accelerate = src.dtype() == float32 && src.flags().contiguous;
+  if (!use_accelerate) {
+    eval(inputs, out);
+    return;
+  }
+  set_unary_output_data(src, out);
+  // vForce takes the element count by pointer.
+  int count = src.data_size();
+  if (!recip_) {
+    vvsqrtf(out.data<float>(), src.data<float>(), &count);
+  } else {
+    vvrsqrtf(out.data<float>(), src.data<float>(), &count);
+  }
+}
+
+// Element-wise subtraction on the CPU. float32 and int32 operands are routed
+// to vDSP kernels through binary_op; every other dtype goes to the generic
+// eval().
+void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 2);
+  auto& a = inputs[0];
+  auto& b = inputs[1];
+
+  if (a.dtype() == float32) {
+    binary_op<float>(
+        a,
+        b,
+        out,
+        [](auto x, auto y) { return x - y; },
+        // scalar - vector, computed as vector * (-1) + scalar.
+        [](const auto* s, const auto* vec, auto* o, auto n) {
+          float minus_1 = -1;
+          vDSP_vsmsa(
+              (const float*)vec, 1, &minus_1, (const float*)s, (float*)o, 1, n);
+        },
+        // vector - scalar, computed as vector + (-scalar).
+        [](const auto* vec, const auto* s, auto* o, auto n) {
+          float val = -(*s);
+          vDSP_vsadd((const float*)vec, 1, &val, (float*)o, 1, n);
+        },
+        // The second operand is handed to vDSP_vsub first.
+        [](const auto* a, const auto* b, auto* o, auto n) {
+          vDSP_vsub((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
+        });
+  } else if (a.dtype() == int32) {
+    binary_op<int>(
+        a,
+        b,
+        out,
+        [](auto x, auto y) { return x - y; },
+        UseDefaultBinaryOp(),
+        // vector - scalar, computed as vector + (-scalar).
+        [](const auto* vec, const auto* s, auto* o, auto n) {
+          // Negate in unsigned arithmetic: `-(*s)` is signed overflow
+          // (undefined behavior) when the scalar is INT_MIN. Every other
+          // value gives the same result as before.
+          int val = static_cast<int>(0u - static_cast<unsigned int>(*s));
+          vDSP_vsaddi((const int*)vec, 1, &val, (int*)o, 1, n);
+        },
+        UseDefaultBinaryOp());
+  } else {
+    eval(inputs, out);
+  }
+}
+
+// Element-wise tangent on the CPU: contiguous float32 data is handed to
+// Accelerate's vvtanf, anything else goes through the generic eval().
+void Tan::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& src = inputs[0];
+  const bool use_accelerate = out.dtype() == float32 && src.flags().contiguous;
+  if (!use_accelerate) {
+    eval(inputs, out);
+    return;
+  }
+  set_unary_output_data(src, out);
+  // vForce takes the element count by pointer.
+  int count = src.data_size();
+  vvtanf(out.data<float>(), src.data<float>(), &count);
+}
+
+// Hyperbolic tangent on the CPU: contiguous float32 data is handed to
+// Accelerate's vvtanhf, anything else goes through the generic eval().
+void Tanh::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& src = inputs[0];
+  const bool use_accelerate = out.dtype() == float32 && src.flags().contiguous;
+  if (!use_accelerate) {
+    eval(inputs, out);
+    return;
+  }
+  set_unary_output_data(src, out);
+  // vForce takes the element count by pointer.
+  int count = src.data_size();
+  vvtanhf(out.data<float>(), src.data<float>(), &count);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/accelerate/quantized.cpp
+++ b/mlx/backend/accelerate/quantized.cpp
@@ -0,0 +1,117 @@
+// Copyright © 2023 Apple Inc.
+
+#include <cassert>
+
+#include <simd/vector.h>
+
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+void _qmm_t_4_64( // float x times 4-bit weights (groups of 64); writes one float per (batch, m, n)
+    float* result,
+    const float* x,
+    const uint32_t* w,
+    const float* scales,
+    const float* biases,
+    int M,
+    int N,
+    int K,
+    int B,
+    bool batched_w) {
+  constexpr int bits = 4;
+  constexpr int group_size = 64;
+  constexpr int bitmask = (1 << bits) - 1;
+  constexpr int pack_factor = 32 / bits; // 8 quantized values per uint32 word
+  constexpr int packs_in_group = group_size / pack_factor; // 8 words per group
+
+  int w_els = N * K / pack_factor; // uint32 words in one weight matrix
+  int g_els = w_els * pack_factor / group_size; // scale/bias entries in one weight matrix
+
+  for (int i = 0; i < B; i++) {
+    for (int m = 0; m < M; m++) {
+      const uint32_t* w_local = w; // every row of x restarts at the beginning of the weights
+      const float* scales_local = scales;
+      const float* biases_local = biases;
+
+      for (int n = 0; n < N; n++) {
+        const simd_float16* x_local = (simd_float16*)x; // row of x read 16 floats at a time
+        simd_float16 sum = 0;
+        for (int k = 0; k < K; k += group_size) {
+          float scale = *scales_local++; // one scale and one bias per group of 64 weights
+          float bias = *biases_local++;
+
+          for (int kw = 0; kw < packs_in_group; kw += 2) { // two words = 16 weights per step
+            // TODO: vectorize this properly
+            simd_uint16 wi;
+            for (int e = 0; e < 2; e++) {
+              uint32_t wii = *w_local++;
+              for (int p = 0; p < 8; p++) {
+                wi[e * 8 + p] = wii & bitmask; // lowest 4 bits first
+                wii >>= bits;
+              }
+            }
+            simd_float16 wf = simd_float(wi);
+            wf *= scale; // dequantize: w * scale + bias
+            wf += bias;
+
+            sum += (*x_local) * wf;
+            x_local++;
+          }
+        }
+
+        *result = simd_reduce_add(sum); // horizontal add of the 16 partial sums
+        result++;
+      }
+
+      x += K; // next row of x
+    }
+    if (batched_w) { // weights, scales and biases advance per batch only when batched
+      w += w_els;
+      scales += g_els;
+      biases += g_els;
+    }
+  }
+}
+
+} // namespace
+
+// Quantized matmul on the CPU. The hand-written kernel handles only the
+// transposed, row-contiguous, float32, 4-bit / group-size-64 configuration;
+// everything else is delegated to the generic eval().
+void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 4);
+
+  auto& x = inputs[0];
+  auto& w = inputs[1];
+  auto& scales = inputs[2];
+  auto& biases = inputs[3];
+
+  const bool fast_path =
+      (transpose_ && x.flags().row_contiguous && w.flags().row_contiguous &&
+       scales.flags().row_contiguous && biases.flags().row_contiguous &&
+       x.dtype() == float32 && bits_ == 4 && group_size_ == 64);
+
+  if (!fast_path) {
+    eval(inputs, out);
+    return;
+  }
+
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+
+  // Problem sizes: x is (..., M, K) and out is (..., N); the leading
+  // dimensions of x are flattened into the batch count.
+  const int inner = x.shape(-1);
+  const int rows = x.shape(-2);
+  const int cols = out.shape(-1);
+  const int batches = x.size() / inner / rows;
+  const bool w_is_batched = w.ndim() > 2;
+
+  _qmm_t_4_64(
+      out.data<float>(),
+      x.data<float>(),
+      w.data<uint32_t>(),
+      scales.data<float>(),
+      biases.data<float>(),
+      rows,
+      cols,
+      inner,
+      batches,
+      w_is_batched);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/accelerate/reduce.cpp
+++ b/mlx/backend/accelerate/reduce.cpp
@@ -0,0 +1,139 @@
+// Copyright © 2023 Apple Inc.
+
+#include <cassert>
+
+#include <Accelerate/Accelerate.h>
+#include <simd/vector.h>
+
+#include "mlx/backend/common/reduce.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+// Minimum functor usable on scalars (T) and on SIMD vectors (VT).
+template <typename T, typename VT>
+struct MinReduction {
+  // Scalar minimum.
+  T operator()(const T& lhs, const T& rhs) {
+    return std::min(lhs, rhs);
+  }
+
+  // Lane-wise minimum of two SIMD vectors.
+  VT operator()(VT lhs, VT rhs) {
+    return simd_min(lhs, rhs);
+  }
+};
+
+// Maximum functor usable on scalars (T) and on SIMD vectors (VT).
+template <typename T, typename VT>
+struct MaxReduction {
+  // Scalar maximum.
+  T operator()(const T& lhs, const T& rhs) {
+    return std::max(lhs, rhs);
+  }
+
+  // Lane-wise maximum of two SIMD vectors.
+  VT operator()(VT lhs, VT rhs) {
+    return simd_max(lhs, rhs);
+  }
+};
+
+// Addition functor usable on scalars (T) and on SIMD vectors (VT).
+template <typename T, typename VT>
+struct SumReduction {
+  // Scalar sum.
+  T operator()(const T& lhs, const T& rhs) {
+    return lhs + rhs;
+  }
+
+  // Lane-wise sum of two SIMD vectors.
+  VT operator()(VT lhs, VT rhs) {
+    return lhs + rhs;
+  }
+};
+
+template <typename T, typename VT, int N, typename Reduction>
+struct StridedReduce { // folds `size` consecutive runs of `stride` elements of x into accum with Reduction
+  void operator()(const T* x, T* accum, int size, size_t stride) {
+    Reduction op;
+
+    for (int i = 0; i < size; i++) {
+      size_t s = stride; // elements left in the current run
+      T* a = accum; // every run restarts at the beginning of accum
+      while (s >= N) { // N elements at a time, each group viewed as one VT
+        *(VT*)a = op((*(VT*)x), (*(VT*)a));
+        x += N;
+        a += N;
+        s -= N;
+      }
+      while (s-- > 0) { // scalar tail for the remaining (< N) elements
+        *a = op(*a, *x);
+        a++;
+        x++;
+      }
+    }
+  }
+};
+
+} // namespace
+
+void Reduce::eval_cpu(const std::vector<array>& inputs, array& out) { // float32 Sum/Max/Min use reduction_op with vDSP helpers; everything else falls back to eval()
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+
+  if (in.dtype() == float32) {
+    if (reduce_type_ == Reduce::Sum) {
+      reduction_op<float, float>(
+          in,
+          out,
+          axes_,
+          0, // initial value of a sum
+          StridedReduce< // presumably the handler for strided reductions (confirm in reduce.h)
+              float,
+              simd_float16,
+              16,
+              SumReduction<float, simd_float16>>(),
+          [](const auto* x, auto* accum, int size) { // adds the sum of `size` consecutive floats to *accum
+            float acc;
+            vDSP_sve((const float*)x, 1, &acc, size);
+            (*accum) += acc;
+          },
+          [](auto* accum, auto x) { *accum += x; }); // single-element update
+      return;
+    } else if (reduce_type_ == Reduce::Max) {
+      reduction_op<float, float>(
+          in,
+          out,
+          axes_,
+          -std::numeric_limits<float>::infinity(), // initial value of a max
+          StridedReduce<
+              float,
+              simd_float16,
+              16,
+              MaxReduction<float, simd_float16>>(),
+          [](const auto* x, auto* accum, int size) { // folds the max of `size` consecutive floats into *accum
+            float max;
+            vDSP_maxv((const float*)x, 1, &max, size);
+            (*accum) = (*accum < max) ? max : *accum;
+          },
+          [](auto* accum, auto x) { (*accum) = (*accum < x) ? x : *accum; }); // single-element update
+      return;
+    } else if (reduce_type_ == Reduce::Min) {
+      reduction_op<float, float>(
+          in,
+          out,
+          axes_,
+          std::numeric_limits<float>::infinity(), // initial value of a min
+          StridedReduce<
+              float,
+              simd_float16,
+              16,
+              MinReduction<float, simd_float16>>(),
+          [](const auto* x, auto* accum, int size) { // folds the min of `size` consecutive floats into *accum
+            float min;
+            vDSP_minv((const float*)x, 1, &min, size);
+            (*accum) = (*accum > min) ? min : *accum;
+          },
+          [](auto* accum, auto x) { (*accum) = (*accum > x) ? x : *accum; }); // single-element update
+      return;
+    }
+  }
+  // TODO: Add integer addition and min/max using the templates above and
+  //       simd_int16 and friends.
+  eval(inputs, out); // reached for non-float32 inputs and for any other reduce type
+}
+
+} // namespace mlx::core
--- a/mlx/backend/accelerate/softmax.cpp
+++ b/mlx/backend/accelerate/softmax.cpp
@@ -0,0 +1,393 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <cassert>
+#include <limits>
+
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include <arm_neon.h>
+#endif
+
+#include <simd/math.h>
+#include <simd/vector.h>
+
+#include "mlx/backend/common/copy.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+/**
+ * Compute exp(x) in an optimizer-friendly way as follows:
+ *
+ * First change the problem to computing 2**y where y = x / ln(2).
+ *
+ * Now we will compute 2**y as 2**y1 * 2**y2 where y1 is the integer part
+ * `ipart` and y2 is the fractional part. For the integer part we perform bit
+ * shifting and for the fractional part we use a polynomial approximation.
+ *
+ * The algorithm and the constants of the polynomial are taken from
+ * https://github.com/akohlmey/fastermath/blob/master/src/exp.c which took them
+ * from the Cephes math library.
+ *
+ * Note: The implementation below is a general fast exp. There could be faster
+ *       implementations for numbers strictly < 0.
+ */
+inline simd_float16 simd_fast_exp(simd_float16 x_init) {
+  auto x = x_init * 1.442695; // multiply with log_2(e)
+  simd_float16 ipart, fpart;
+  simd_int16 epart;
+  x = simd_clamp(x, -80, 80); // keeps ipart + 127 below within (0, 255)
+  ipart = simd::floor(x + 0.5); // floor(x + 0.5): nearest integer
+  fpart = x - ipart;
+
+  x = 1.535336188319500e-4f; // polynomial in fpart, evaluated with Horner's scheme
+  x = x * fpart + 1.339887440266574e-3f;
+  x = x * fpart + 9.618437357674640e-3f;
+  x = x * fpart + 5.550332471162809e-2f;
+  x = x * fpart + 2.402264791363012e-1f;
+  x = x * fpart + 6.931472028550421e-1f;
+  x = x * fpart + 1.000000000000000f;
+
+  // generate 2**ipart in the floating point representation using integer
+  // bitshifting
+  epart = (simd_int(ipart) + 127) << 23; // float32: exponent bias 127, 23 mantissa bits
+
+  // Avoid suppressing NaNs
+  simd_int16 eq = (x_init == x_init); // lanes holding NaN compare unequal to themselves
+  return simd_bitselect(x_init, (*(simd_float16*)&epart) * x, eq); // NaN lanes keep x_init, the others get 2**ipart * poly
+}
+
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/**
+ * The ARM neon equivalent of the fast exp above, operating on eight float16
+ * lanes. Unlike the float32 version it has no NaN pass-through step.
+ */
+inline float16x8_t neon_fast_exp(float16x8_t x) {
+  x = vmulq_f16(x, vdupq_n_f16(float16_t(1.442695f))); // multiply with log_2(e)
+  x = vmaxq_f16(x, vdupq_n_f16(float16_t(-14.f))); // clamp under with -14
+  x = vminq_f16(x, vdupq_n_f16(float16_t(14.f))); // clamp over with 14
+
+  float16x8_t ipart = vrndmq_f16(vaddq_f16(x, vdupq_n_f16(float16_t(0.5f)))); // floor(x + 0.5)
+  float16x8_t fpart = vsubq_f16(x, ipart);
+
+  x = vdupq_n_f16(float16_t(1.535336188319500e-4f)); // same polynomial as the float32 version, via fused multiply-add
+  x = vfmaq_f16(vdupq_n_f16(float16_t(1.339887440266574e-3f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(9.618437357674640e-3f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(5.550332471162809e-2f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(2.402264791363012e-1f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(6.931472028550421e-1f)), x, fpart);
+  x = vfmaq_f16(vdupq_n_f16(float16_t(1.000000000000000f)), x, fpart);
+
+  // generate 2**ipart in the floating point representation using integer
+  // bitshifting
+  int16x8_t epart = vcvtq_s16_f16(ipart);
+  epart = vaddq_s16(epart, vdupq_n_s16(15)); // float16 exponent bias
+  epart = vshlq_n_s16(epart, 10); // float16 has 10 mantissa bits
+
+  return vmulq_f16(vreinterpretq_f16_s16(epart), x);
+}
+
+/**
+ * Implementation of folding maximum for ARM neon: returns the largest of the
+ * eight lanes of `x`. This should possibly be refactored out of softmax.cpp
+ * at some point.
+ */
+inline float16_t neon_reduce_max(float16x8_t x) {
+  float16x4_t y;
+  y = vpmax_f16(vget_low_f16(x), vget_high_f16(x)); // pairwise max: 8 candidates -> 4
+  y = vpmax_f16(y, y); // 4 -> 2
+  y = vpmax_f16(y, y); // 2 -> 1; the result is in lane 0
+  return vget_lane_f16(y, 0);
+}
+
+/**
+ * Implementation of folding sum for ARM neon: returns the sum of the eight
+ * lanes of `x`. This should possibly be refactored out of softmax.cpp at some
+ * point.
+ */
+inline float16_t neon_reduce_add(float16x8_t x) {
+  float16x4_t y;
+  float16x4_t zero = vdup_n_f16(0);
+  y = vpadd_f16(vget_low_f16(x), vget_high_f16(x)); // pairwise add: 8 values -> 4 partial sums
+  y = vpadd_f16(y, zero); // 4 -> 2 (the upper lanes are filled from `zero`)
+  y = vpadd_f16(y, zero); // 2 -> 1; the total is in lane 0
+  return vget_lane_f16(y, 0);
+}
+
+template <typename T, typename VT>
+struct NeonFp16SimdOps { // vector primitives for softmax built on NEON float16 intrinsics
+  VT init(T a) { // broadcast a scalar to every lane
+    return vdupq_n_f16(a);
+  }
+
+  VT load(const T* a) {
+    return vld1q_f16(a);
+  }
+
+  void store(T* dst, VT x) {
+    vst1q_f16(dst, x);
+  }
+
+  VT max(VT a, VT b) { // lane-wise maximum
+    return vmaxq_f16(a, b);
+  }
+
+  VT exp(VT x) { // approximate exp, see neon_fast_exp
+    return neon_fast_exp(x);
+  }
+
+  VT add(VT a, VT b) {
+    return vaddq_f16(a, b);
+  }
+
+  VT sub(VT a, T b) { // subtract a scalar from every lane
+    return vsubq_f16(a, vdupq_n_f16(b));
+  }
+
+  VT mul(VT a, VT b) {
+    return vmulq_f16(a, b);
+  }
+
+  VT mul(VT a, T b) { // multiply every lane by a scalar
+    return vmulq_f16(a, vdupq_n_f16(b));
+  }
+
+  T reduce_max(VT x) { // largest lane
+    return neon_reduce_max(x);
+  }
+
+  T reduce_add(VT x) { // sum of all lanes
+    return neon_reduce_add(x);
+  }
+};
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename T, typename VT>
+struct AccelerateSimdOps { // vector primitives for softmax built on <simd/simd.h> vector types
+  VT init(T a) { // scalar-to-vector conversion
+    return a;
+  }
+
+  VT load(const T* a) { // reads one VT directly through the pointer
+    return *(VT*)a;
+  }
+
+  void store(T* dst, VT x) { // writes one VT directly through the pointer
+    *(VT*)dst = x;
+  }
+
+  VT max(VT a, VT b) { // lane-wise maximum
+    return simd_max(a, b);
+  }
+
+  VT exp(VT x) { // approximate exp, see simd_fast_exp
+    return simd_fast_exp(x);
+  }
+
+  VT add(VT a, VT b) {
+    return a + b;
+  }
+
+  VT sub(VT a, T b) { // subtract a scalar from every lane
+    return a - b;
+  }
+
+  VT mul(VT a, VT b) {
+    return a * b;
+  }
+
+  VT mul(VT a, T b) { // multiply every lane by a scalar
+    return a * b;
+  }
+
+  T reduce_max(VT x) { // largest lane
+    return simd_reduce_max(x);
+  }
+
+  T reduce_add(VT x) { // sum of all lanes
+    return simd_reduce_add(x);
+  }
+};
+
+/**
+ * Numerically stable softmax along the last axis.
+ *
+ * The data is processed as `L` consecutive rows of `M` elements, where `M` is
+ * the size of the last dimension. Each row is handled in three passes:
+ *   1. find the row maximum,
+ *   2. compute exp(x - max) and accumulate the normalizer,
+ *   3. scale by 1 / normalizer.
+ * Every pass consumes `N` elements at a time through `Ops` and finishes the
+ * remaining (< N) elements with scalar code.
+ *
+ * T is the storage type and AccT the accumulation type. When they are the
+ * same, pass 2 stores the exponentials in `out` and pass 3 rescales them in
+ * place. Otherwise pass 2 stores nothing and pass 3 recomputes the
+ * exponentials from `in` in AccT precision.
+ */
+template <typename T, typename AccT, typename VT, typename Ops, int N>
+void softmax(const array& in, array& out) {
+  Ops ops;
+
+  const T* in_ptr = in.data<T>();
+  T* out_ptr = out.data<T>();
+  int M = in.shape().back();
+  if (M == 0) {
+    // Empty rows: nothing to normalize, and the row count below would divide
+    // by zero.
+    return;
+  }
+  int L = in.data_size() / M;
+  const T* current_in_ptr;
+  T* current_out_ptr;
+
+  for (int i = 0; i < L; i++, in_ptr += M, out_ptr += M) {
+    // Find the maximum
+    current_in_ptr = in_ptr;
+    VT vmaximum = ops.init(-std::numeric_limits<float>::infinity());
+    size_t s = M;
+    while (s >= N) {
+      VT vals;
+      if constexpr (std::is_same<T, AccT>::value) {
+        vals = ops.load(current_in_ptr);
+      } else {
+        // Widen element by element when storage and accumulation types differ.
+        for (int j = 0; j < N; ++j) {
+          vals[j] = static_cast<AccT>(current_in_ptr[j]);
+        }
+      }
+      vmaximum = ops.max(vals, vmaximum);
+      current_in_ptr += N;
+      s -= N;
+    }
+    AccT maximum = ops.reduce_max(vmaximum);
+    while (s-- > 0) {
+      maximum = std::max(maximum, static_cast<AccT>(*current_in_ptr));
+      current_in_ptr++;
+    }
+
+    // Compute the normalizer and the exponentials
+    VT vnormalizer = ops.init(0.0);
+    current_out_ptr = out_ptr;
+    current_in_ptr = in_ptr;
+    s = M;
+    while (s >= N) {
+      VT vexp;
+      if constexpr (std::is_same<T, AccT>::value) {
+        vexp = ops.load(current_in_ptr);
+      } else {
+        for (int j = 0; j < N; ++j) {
+          vexp[j] = static_cast<AccT>(current_in_ptr[j]);
+        }
+      }
+      vexp = ops.exp(ops.sub(vexp, maximum));
+      if constexpr (std::is_same<T, AccT>::value) {
+        ops.store(current_out_ptr, vexp);
+      }
+      vnormalizer = ops.add(vnormalizer, vexp);
+      current_in_ptr += N;
+      current_out_ptr += N;
+      s -= N;
+    }
+    AccT normalizer = ops.reduce_add(vnormalizer);
+    while (s-- > 0) {
+      AccT _exp = std::exp(*current_in_ptr - maximum);
+      // Compile-time branch, like every other T/AccT check in this function.
+      if constexpr (std::is_same<T, AccT>::value) {
+        *current_out_ptr = _exp;
+      }
+      normalizer += _exp;
+      current_in_ptr++;
+      current_out_ptr++;
+    }
+    normalizer = 1 / normalizer;
+
+    // Normalize
+    current_out_ptr = out_ptr;
+    current_in_ptr = in_ptr;
+    s = M;
+    while (s >= N) {
+      if constexpr (std::is_same<T, AccT>::value) {
+        // The exponentials are already in `out`; rescale them in place.
+        ops.store(
+            current_out_ptr, ops.mul(ops.load(current_out_ptr), normalizer));
+      } else {
+        // Nothing was stored in the previous pass; recompute from the input.
+        VT vexp;
+        for (int j = 0; j < N; ++j) {
+          vexp[j] = static_cast<AccT>(current_in_ptr[j]);
+        }
+        vexp = ops.mul(ops.exp(ops.sub(vexp, maximum)), normalizer);
+        for (int j = 0; j < N; ++j) {
+          current_out_ptr[j] = vexp[j];
+        }
+        current_in_ptr += N;
+      }
+      current_out_ptr += N;
+      s -= N;
+    }
+    while (s-- > 0) {
+      if constexpr (std::is_same<T, AccT>::value) {
+        *current_out_ptr *= normalizer;
+      } else {
+        AccT _exp = std::exp(*current_in_ptr - maximum);
+        *current_out_ptr = static_cast<T>(_exp * normalizer);
+        current_in_ptr++;
+      }
+      current_out_ptr++;
+    }
+  }
+}
+
+} // namespace
+
+void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) { // CPU softmax: picks a SIMD kernel by dtype, falls back to eval() otherwise
+  assert(inputs.size() == 1);
+
+  // Make sure that the last dimension is contiguous
+  auto check_input = [](array x) {
+    bool no_copy = x.strides()[x.ndim() - 1] == 1; // unit stride along the last axis
+    if (x.ndim() > 1) {
+      auto s = x.strides()[x.ndim() - 2];
+      no_copy &= (s == 0 || s == x.shape().back()); // rows are back to back, or one row is repeated
+    }
+    if (no_copy) {
+      return x;
+    } else {
+      array x_copy(x.shape(), x.dtype(), nullptr, {});
+      copy(x, x_copy, CopyType::General);
+      return x_copy;
+    }
+  };
+  array in = check_input(std::move(inputs[0])); // NOTE(review): inputs is const, so this std::move most likely ends up copying the array handle
+  out.set_data( // allocated before the dtype switch, i.e. also on the branches that throw or call eval()
+      allocator::malloc_or_wait(in.data_size() * in.itemsize()),
+      in.data_size(),
+      in.strides(),
+      in.flags());
+
+  switch (in.dtype()) {
+    case bool_:
+    case uint8:
+    case uint16:
+    case uint32:
+    case uint64:
+    case int8:
+    case int16:
+    case int32:
+    case int64:
+      throw std::invalid_argument(
+          "Softmax is defined only for floating point types");
+      break;
+    case float32:
+      softmax<
+          float,
+          float,
+          simd_float16,
+          AccelerateSimdOps<float, simd_float16>,
+          16>(in, out);
+      break;
+    case float16:
+      if (precise_) { // float16 storage, float32 accumulation
+        softmax<
+            float16_t,
+            float,
+            simd_float16,
+            AccelerateSimdOps<float, simd_float16>,
+            16>(in, out);
+      } else {
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        softmax< // float16 storage and accumulation on NEON
+            float16_t,
+            float16_t,
+            float16x8_t,
+            NeonFp16SimdOps<float16_t, float16x8_t>,
+            8>(in, out);
+#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        eval(inputs, out); // Redirect to common backend for consistency
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+      }
+      break;
+    case bfloat16:
+      eval(inputs, out); // no SIMD kernel here for bfloat16
+      break;
+    case complex64:
+      eval(inputs, out); // no SIMD kernel here for complex64
+      break;
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/accelerate/utils.h
+++ b/mlx/backend/accelerate/utils.h
@@ -0,0 +1,28 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#pragma once
+
+#include <Accelerate/Accelerate.h>
+#include "mlx/dtype.h"
+
+namespace mlx::core {
+
+// Map an MLX dtype onto the corresponding BNNS data type. Unsigned, signed
+// and floating point kinds are encoded as the BNNS kind bit combined with the
+// element width in bits. Throws std::invalid_argument for complex types.
+//
+// Declared `inline` because this header defines the function: without it,
+// every translation unit that includes the header emits its own external
+// definition and the program fails to link with duplicate symbols.
+inline BNNSDataType to_bnns_dtype(Dtype mlx_dtype) {
+  uint32_t size_bits = size_of(mlx_dtype) * 8;
+  switch (kindof(mlx_dtype)) {
+    case Dtype::Kind::b:
+      return BNNSDataTypeBoolean;
+    case Dtype::Kind::u:
+      return BNNSDataType(BNNSDataTypeUIntBit | size_bits);
+    case Dtype::Kind::i:
+      return BNNSDataType(BNNSDataTypeIntBit | size_bits);
+    case Dtype::Kind::f:
+      return BNNSDataType(BNNSDataTypeFloatBit | size_bits);
+    case Dtype::Kind::V:
+      return BNNSDataTypeBFloat16;
+    case Dtype::Kind::c:
+      throw std::invalid_argument("BNNS does not support complex types");
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/CMakeLists.txt
+++ b/mlx/backend/common/CMakeLists.txt
@@ -1,8 +1,70 @@
+# Compiler handed to the preamble generator script. On Darwin the C compiler
+# is used and the script is told that it is clang.
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
+  set(COMPILER ${CMAKE_C_COMPILER})
+  set(CLANG TRUE)
+else()
+  set(COMPILER ${CMAKE_CXX_COMPILER})
+  # Always define CLANG: an unset variable expands to nothing, which would
+  # shift CMAKE_SYSTEM_PROCESSOR into the script's fourth positional argument.
+  set(CLANG FALSE)
+endif()
+
+# The generator exists as a PowerShell script for MSVC and as a bash script
+# for everything else.
+if(MSVC)
+  set(SHELL_EXT ps1)
+  set(SHELL_CMD powershell -ExecutionPolicy Bypass -File)
+else()
+  set(SHELL_EXT sh)
+  set(SHELL_CMD /bin/bash)
+endif()
+
+# Generate compiled_preamble.cpp in the binary directory. Script arguments, in
+# order: output file, compiler, project root, clang flag, target processor.
+add_custom_command(
+  OUTPUT compiled_preamble.cpp
+  COMMAND
+    ${SHELL_CMD} ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.${SHELL_EXT}
+    ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp ${COMPILER}
+    ${PROJECT_SOURCE_DIR} ${CLANG} ${CMAKE_SYSTEM_PROCESSOR}
+  DEPENDS make_compiled_preamble.${SHELL_EXT}
+          compiled_preamble.h
+          ${PROJECT_SOURCE_DIR}/mlx/types/half_types.h
+          ${PROJECT_SOURCE_DIR}/mlx/types/fp16.h
+          ${PROJECT_SOURCE_DIR}/mlx/types/bf16.h
+          ${PROJECT_SOURCE_DIR}/mlx/types/complex.h
+          ops.h
+  VERBATIM)
+
+add_custom_target(cpu_compiled_preamble DEPENDS compiled_preamble.cpp)
+
+# Make sure the preamble is generated before mlx is built.
+add_dependencies(mlx cpu_compiled_preamble)
+
 target_sources(
  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/erf.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce_utils.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)
+          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
+          ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp)
+
+if(IOS) # iOS builds get compiled_nocpu.cpp, every other platform gets compiled_cpu.cpp
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled_nocpu.cpp)
+else()
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled_cpu.cpp)
+endif()
--- a/mlx/backend/common/arange.h
+++ b/mlx/backend/common/arange.h
--- a/mlx/backend/common/arg_reduce.cpp
+++ b/mlx/backend/common/arg_reduce.cpp
@@ -2,8 +2,8 @@

 #include <cassert>

-#include "mlx/backend/common/utils.h"
 #include "mlx/primitives.h"
+#include "utils.h"

 namespace mlx::core {

@@ -61,7 +61,7 @@ void arg_reduce_dispatch(

 } // namespace

-void ArgReduce::eval_cpu(const std::vector<array>& inputs, array& out) {
+void ArgReduce::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  out.set_data(allocator::malloc_or_wait(out.nbytes()));
--- a/mlx/backend/common/binary.cpp
+++ b/mlx/backend/common/binary.cpp
@@ -5,9 +5,9 @@
 #include <sstream>

 #include "mlx/allocator.h"
-#include "mlx/backend/cpu/binary.h"
-#include "mlx/backend/cpu/binary_ops.h"
-#include "mlx/backend/cpu/binary_two.h"
+#include "mlx/backend/common/binary.h"
+#include "mlx/backend/common/binary_two.h"
+#include "mlx/backend/common/ops.h"
 #include "mlx/primitives.h"
 #include "mlx/utils.h"

@@ -15,61 +15,69 @@ namespace mlx::core {

 namespace {

+template <typename T, typename U, typename Op>
+void comparison_op(const array& a, const array& b, array& out, Op op) { // typed helper: T is the input element type, U the output element type
+  DefaultScalarVector<T, U, Op> opsv(op); // wraps `op` for the scalar / vector case
+  DefaultVectorScalar<T, U, Op> opvs(op); // wraps `op` for the vector / scalar case
+  DefaultVectorVector<T, U, Op> opvv(op); // wraps `op` for the vector / vector case
+  binary_op<T, U>(a, b, out, op, opsv, opvs, opvv); // binary_op receives the plain op plus the three wrappers
+}
+
 template <typename Op>
 void comparison_op(const array& a, const array& b, array& out, Op op) {
  switch (a.dtype()) {
    case bool_:
-      binary_op<bool, bool>(a, b, out, op);
+      comparison_op<bool, bool>(a, b, out, op);
      break;
    case uint8:
-      binary_op<uint8_t, bool>(a, b, out, op);
+      comparison_op<uint8_t, bool>(a, b, out, op);
      break;
    case uint16:
-      binary_op<uint16_t, bool>(a, b, out, op);
+      comparison_op<uint16_t, bool>(a, b, out, op);
      break;
    case uint32:
-      binary_op<uint32_t, bool>(a, b, out, op);
+      comparison_op<uint32_t, bool>(a, b, out, op);
      break;
    case uint64:
-      binary_op<uint64_t, bool>(a, b, out, op);
+      comparison_op<uint64_t, bool>(a, b, out, op);
      break;
    case int8:
-      binary_op<int8_t, bool>(a, b, out, op);
+      comparison_op<int8_t, bool>(a, b, out, op);
      break;
    case int16:
-      binary_op<int16_t, bool>(a, b, out, op);
+      comparison_op<int16_t, bool>(a, b, out, op);
      break;
    case int32:
-      binary_op<int32_t, bool>(a, b, out, op);
+      comparison_op<int32_t, bool>(a, b, out, op);
      break;
    case int64:
-      binary_op<int64_t, bool>(a, b, out, op);
+      comparison_op<int64_t, bool>(a, b, out, op);
      break;
    case float16:
-      binary_op<float16_t, bool>(a, b, out, op);
+      comparison_op<float16_t, bool>(a, b, out, op);
      break;
    case float32:
-      binary_op<float, bool>(a, b, out, op);
+      comparison_op<float, bool>(a, b, out, op);
      break;
    case bfloat16:
-      binary_op<bfloat16_t, bool>(a, b, out, op);
+      comparison_op<bfloat16_t, bool>(a, b, out, op);
      break;
    case complex64:
-      binary_op<complex64_t, bool>(a, b, out, op);
+      comparison_op<complex64_t, bool>(a, b, out, op);
      break;
  }
 }

 } // namespace

-void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Add::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  binary(a, b, out, detail::Add());
 }

-void DivMod::eval_cpu(
+void DivMod::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 2);
@@ -124,68 +132,50 @@ void DivMod::eval_cpu(
  }
 }

-void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Divide::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  binary(a, b, out, detail::Divide());
 }

-void Remainder::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Remainder::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  binary(a, b, out, detail::Remainder());
 }

-void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Equal::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
-  auto& a = inputs[0];
-  auto& b = inputs[1];
  if (equal_nan_) {
-    switch (a.dtype()) {
-      case float16:
-        binary_op<float16_t, bool>(a, b, out, detail::NaNEqual());
-        break;
-      case float32:
-        binary_op<float, bool>(a, b, out, detail::NaNEqual());
-        break;
-      case bfloat16:
-        binary_op<bfloat16_t, bool>(a, b, out, detail::NaNEqual());
-        break;
-      case complex64:
-        binary_op<complex64_t, bool>(a, b, out, detail::NaNEqual());
-        break;
-      default:
-        throw std::runtime_error(
-            "[NanEqual::eval_cpu] Only for floating point types.");
-    }
+    comparison_op(inputs[0], inputs[1], out, detail::NaNEqual()); // dispatched for every dtype; the removed switch threw for non-floating types
  } else {
-    comparison_op(a, b, out, detail::Equal());
+    comparison_op(inputs[0], inputs[1], out, detail::Equal());
  }
 }

-void Greater::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Greater::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  comparison_op(inputs[0], inputs[1], out, detail::Greater());
 }

-void GreaterEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
+void GreaterEqual::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  comparison_op(inputs[0], inputs[1], out, detail::GreaterEqual());
 }

-void Less::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Less::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  comparison_op(inputs[0], inputs[1], out, detail::Less());
 }

-void LessEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
+void LessEqual::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  comparison_op(inputs[0], inputs[1], out, detail::LessEqual());
 }

-void LogAddExp::eval_cpu(const std::vector<array>& inputs, array& out) {
+void LogAddExp::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
@@ -206,54 +196,54 @@ void LogAddExp::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-void LogicalAnd::eval_cpu(const std::vector<array>& inputs, array& out) {
+void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // LogicalAnd requires two input arrays
  auto& in1 = inputs[0];
  auto& in2 = inputs[1];
  binary(in1, in2, out, detail::LogicalAnd());
 }

-void LogicalOr::eval_cpu(const std::vector<array>& inputs, array& out) {
+void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // LogicalOr requires two input arrays
  auto& in1 = inputs[0];
  auto& in2 = inputs[1];
  binary(in1, in2, out, detail::LogicalOr());
 }

-void Maximum::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Maximum::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  binary(a, b, out, detail::Maximum());
 }

-void Minimum::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Minimum::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  binary(a, b, out, detail::Minimum());
 }

-void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Multiply::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  binary(a, b, out, detail::Multiply());
 }

-void NotEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
+void NotEqual::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  comparison_op(inputs[0], inputs[1], out, detail::NotEqual());
 }

-void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Power::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  binary(a, b, out, detail::Power());
 }

-void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Subtract::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
@@ -317,7 +307,7 @@ void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
+void ArcTan2::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& a = inputs[0];
  const auto& b = inputs[1];
--- a/mlx/backend/common/binary.h
+++ b/mlx/backend/common/binary.h
@@ -1,6 +1,7 @@
 // Copyright © 2023 Apple Inc.

 #pragma once
+#include <cassert>

 #include "mlx/allocator.h"
 #include "mlx/array.h"
@@ -8,6 +9,8 @@

 namespace mlx::core {

+namespace {
+
 enum class BinaryOpType {
  ScalarScalar,
  ScalarVector,
@@ -16,7 +19,7 @@ enum class BinaryOpType {
  General,
 };

-inline BinaryOpType get_binary_op_type(const array& a, const array& b) {
+BinaryOpType get_binary_op_type(const array& a, const array& b) {
  BinaryOpType bopt;
  if (a.data_size() == 1 && b.data_size() == 1) {
    bopt = BinaryOpType::ScalarScalar;
@@ -25,8 +28,8 @@ inline BinaryOpType get_binary_op_type(const array& a, const array& b) {
  } else if (b.data_size() == 1 && a.flags().contiguous) {
    bopt = BinaryOpType::VectorScalar;
  } else if (
-      (a.flags().row_contiguous && b.flags().row_contiguous) ||
-      (a.flags().col_contiguous && b.flags().col_contiguous)) {
+      a.flags().row_contiguous && b.flags().row_contiguous ||
+      a.flags().col_contiguous && b.flags().col_contiguous) {
    bopt = BinaryOpType::VectorVector;
  } else {
    bopt = BinaryOpType::General;
@@ -34,7 +37,7 @@ inline BinaryOpType get_binary_op_type(const array& a, const array& b) {
  return bopt;
 }

-inline void set_binary_op_output_data(
+void set_binary_op_output_data(
    const array& a,
    const array& b,
    array& out,
@@ -119,4 +122,409 @@ inline void set_binary_op_output_data(
  }
 }

+struct UseDefaultBinaryOp {}; // tag type: pass as opsv/opvs/opvv to have binary_op substitute the matching Default* functor
+
+template <typename T, typename U, typename Op>
+struct DefaultVectorScalar {
+  Op op; // element-wise functor, invoked as op(lhs, rhs)
+
+  DefaultVectorScalar(Op op_) : op(op_) {}
+
+  void operator()(const T* a, const T* b, U* dst, int size) {
+    T scalar = *b; // only b[0] is read; it is paired with each of the `size` elements of a
+    while (size-- > 0) {
+      *dst = op(*a, scalar);
+      dst++;
+      a++;
+    }
+  }
+};
+
+template <typename T, typename U, typename Op>
+struct DefaultScalarVector {
+  Op op; // element-wise functor, invoked as op(lhs, rhs)
+
+  DefaultScalarVector(Op op_) : op(op_) {}
+
+  void operator()(const T* a, const T* b, U* dst, int size) {
+    T scalar = *a; // only a[0] is read; it is paired with each of the `size` elements of b
+    while (size-- > 0) {
+      *dst = op(scalar, *b);
+      dst++;
+      b++;
+    }
+  }
+};
+
+template <typename T, typename U, typename Op>
+struct DefaultVectorVector {
+  Op op; // element-wise functor, invoked as op(lhs, rhs)
+
+  DefaultVectorVector(Op op_) : op(op_) {}
+
+  void operator()(const T* a, const T* b, U* dst, int size) {
+    while (size-- > 0) { // a, b and dst advance in lockstep for exactly `size` elements
+      *dst = op(*a, *b);
+      dst++;
+      a++;
+      b++;
+    }
+  }
+};
+
+template <typename T, typename U, typename Op, int D, bool Strided>
+void binary_op_dims(
+    const T* a,
+    const T* b,
+    U* out,
+    Op op,
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& out_strides,
+    int axis) {
+  auto stride_a = a_strides[axis];
+  auto stride_b = b_strides[axis];
+  auto stride_out = out_strides[axis];
+  auto N = shape[axis]; // extent of the axis handled at this recursion level
+
+  for (int i = 0; i < N; i++) {
+    if constexpr (D > 1) { // D counts the axes still to visit; recurse into axis + 1
+      binary_op_dims<T, U, Op, D - 1, Strided>(
+          a, b, out, op, shape, a_strides, b_strides, out_strides, axis + 1);
+    } else {
+      if constexpr (Strided) {
+        op(a, b, out, stride_out); // bulk call: op gets the three pointers and stride_out as its last (size) argument
+      } else {
+        *out = op(*a, *b); // element-wise call: one input pair produces one output value
+      }
+    }
+    out += stride_out; // step all three pointers by this axis' strides
+    a += stride_a;
+    b += stride_b;
+  }
+}
+
+template <typename T, typename U, bool Strided, typename Op>
+void binary_op_dispatch_dims(
+    const array& a,
+    const array& b,
+    array& out,
+    Op op,
+    int dim,
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& out_strides) {
+  const T* a_ptr = a.data<T>();
+  const T* b_ptr = b.data<T>();
+  U* out_ptr = out.data<U>();
+  switch (dim) { // 1-3 axes: handled entirely by the D-templated recursion, then return
+    case 1:
+      binary_op_dims<T, U, Op, 1, Strided>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          out_strides,
+          0);
+      return;
+    case 2:
+      binary_op_dims<T, U, Op, 2, Strided>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          out_strides,
+          0);
+      return;
+    case 3:
+      binary_op_dims<T, U, Op, 3, Strided>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          out_strides,
+          0);
+      return;
+  }
+
+  ContiguousIterator a_it(shape, a_strides, dim - 3); // NOTE(review): presumably walks the leading dim - 3 axes; ContiguousIterator is defined elsewhere — confirm
+  ContiguousIterator b_it(shape, b_strides, dim - 3);
+  auto stride = out_strides[dim - 4]; // out offset added per outer step; this path is reached only when dim is not 1, 2 or 3
+  for (int64_t elem = 0; elem < a.size(); elem += stride) {
+    binary_op_dims<T, U, Op, 3, Strided>(
+        a_ptr + a_it.loc,
+        b_ptr + b_it.loc,
+        out_ptr + elem,
+        op,
+        shape,
+        a_strides,
+        b_strides,
+        out_strides,
+        dim - 3); // the recursion covers axes dim - 3, dim - 2 and dim - 1
+    a_it.step();
+    b_it.step();
+  }
+}
+
+template <
+    typename T,
+    typename U,
+    typename Op,
+    typename OpSV,
+    typename OpVS,
+    typename OpVV>
+void binary_op(
+    const array& a,
+    const array& b,
+    array& out,
+    Op op,
+    OpSV opsv,
+    OpVS opvs,
+    OpVV opvv) {
+  auto bopt = get_binary_op_type(a, b);
+  set_binary_op_output_data(a, b, out, bopt);
+
+  // The full computation is scalar scalar so call the base op once
+  if (bopt == BinaryOpType::ScalarScalar) {
+    *(out.data<U>()) = op(*a.data<T>(), *b.data<T>());
+    return;
+  }
+
+  // The full computation is scalar vector so delegate to the op
+  if (bopt == BinaryOpType::ScalarVector) {
+    opsv(a.data<T>(), b.data<T>(), out.data<U>(), b.data_size());
+    return;
+  }
+
+  // The full computation is vector scalar so delegate to the op
+  if (bopt == BinaryOpType::VectorScalar) {
+    opvs(a.data<T>(), b.data<T>(), out.data<U>(), a.data_size());
+    return;
+  }
+
+  // The full computation is vector vector so delegate to the op
+  if (bopt == BinaryOpType::VectorVector) {
+    opvv(a.data<T>(), b.data<T>(), out.data<U>(), out.size());
+    return;
+  }
+
+  // General computation so let's try to optimize
+  auto [new_shape, new_strides] = collapse_contiguous_dims(
+      a.shape(), {a.strides(), b.strides(), out.strides()});
+  const auto& a_strides = new_strides[0];
+  const auto& b_strides = new_strides[1];
+  const auto& strides = new_strides[2]; // collapsed strides of out (third entry passed above)
+
+  // Get the left-most dim such that the array is row contiguous after
+  auto leftmost_rc_dim = [&strides](const auto& arr_strides) {
+    int d = arr_strides.size() - 1;
+    for (; d >= 0 && arr_strides[d] == strides[d]; d--) {
+    }
+    return d + 1;
+  };
+  auto a_rc_dim = leftmost_rc_dim(a_strides);
+  auto b_rc_dim = leftmost_rc_dim(b_strides);
+
+  // Get the left-most dim such that the array is a broadcasted "scalar" after
+  auto leftmost_s_dim = [](const auto& arr_strides) {
+    int d = arr_strides.size() - 1;
+    for (; d >= 0 && arr_strides[d] == 0; d--) {
+    }
+    return d + 1;
+  };
+  auto a_s_dim = leftmost_s_dim(a_strides);
+  auto b_s_dim = leftmost_s_dim(b_strides);
+
+  auto ndim = new_shape.size();
+
+  // Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
+  int dim = ndim; // number of leading axes handed to binary_op_dispatch_dims
+  if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
+    bopt = BinaryOpType::VectorVector;
+    dim = d;
+    // Case 2: LxM and Fx1 where L and F are broadcastable and M is row
+    // contiguous
+  } else if (int d = std::max(a_rc_dim, b_s_dim); d < ndim) {
+    bopt = BinaryOpType::VectorScalar;
+    dim = d;
+    // Case 3: Lx1 and FxM where L and F are broadcastable and M is row
+    // contiguous
+  } else if (int d = std::max(a_s_dim, b_rc_dim); d < ndim) {
+    bopt = BinaryOpType::ScalarVector;
+    dim = d;
+  }
+
+  // Can be sure dim > 0 since otherwise we would have used one of the fully
+  // contiguous methods above. Except for the case that the flags do not
+  // correspond to the underlying contiguity.
+  if (dim == 0 || strides[dim - 1] < 16) { // NOTE(review): 16 looks like the minimum inner run worth a bulk functor call — confirm
+    bopt = BinaryOpType::General;
+    dim = ndim;
+  }
+
+  switch (bopt) {
+    case BinaryOpType::VectorVector:
+      binary_op_dispatch_dims<T, U, true>(
+          a, b, out, opvv, dim, new_shape, a_strides, b_strides, strides);
+      break;
+    case BinaryOpType::VectorScalar:
+      binary_op_dispatch_dims<T, U, true>(
+          a, b, out, opvs, dim, new_shape, a_strides, b_strides, strides);
+      break;
+    case BinaryOpType::ScalarVector:
+      binary_op_dispatch_dims<T, U, true>(
+          a, b, out, opsv, dim, new_shape, a_strides, b_strides, strides);
+      break;
+    default: // General: op is applied one element at a time (Strided = false)
+      binary_op_dispatch_dims<T, U, false>(
+          a, b, out, op, dim, new_shape, a_strides, b_strides, strides);
+      break;
+  }
+}
+
+template <typename T, typename Op, typename OpSV, typename OpVS, typename OpVV>
+void binary_op(
+    const array& a,
+    const array& b,
+    array& out,
+    Op op,
+    OpSV opsv,
+    OpVS opvs,
+    OpVV opvv) {
+  // Swap each UseDefaultBinaryOp placeholder for the matching Default* functor.
+  // TODO: template specializations/overloading might make this simpler.
+
+  if constexpr (std::is_same<decltype(opsv), UseDefaultBinaryOp>::value) {
+    if constexpr (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
+      if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
+        // All ops are UseDefaultBinaryOp (why oh why would someone call that?)
+        binary_op<T, T>(
+            a,
+            b,
+            out,
+            op,
+            DefaultScalarVector<T, T, Op>(op),
+            DefaultVectorScalar<T, T, Op>(op),
+            DefaultVectorVector<T, T, Op>(op));
+      } else {
+        // opsv and opvs were UseDefaultBinaryOp
+        binary_op<T, T>(
+            a,
+            b,
+            out,
+            op,
+            DefaultScalarVector<T, T, Op>(op),
+            DefaultVectorScalar<T, T, Op>(op),
+            opvv);
+      }
+    } else if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::
+                             value) {
+      // opsv and opvv were UseDefaultBinaryOp
+      binary_op<T, T>(
+          a,
+          b,
+          out,
+          op,
+          DefaultScalarVector<T, T, Op>(op),
+          opvs,
+          DefaultVectorVector<T, T, Op>(op));
+    } else {
+      // opsv was UseDefaultBinaryOp
+      binary_op<T, T>(
+          a, b, out, op, DefaultScalarVector<T, T, Op>(op), opvs, opvv);
+    }
+  } else if constexpr (std::is_same<decltype(opvs), UseDefaultBinaryOp>::
+                           value) {
+    if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
+      // opvs and opvv were UseDefaultBinaryOp
+      binary_op<T, T>(
+          a,
+          b,
+          out,
+          op,
+          opsv,
+          DefaultVectorScalar<T, T, Op>(op),
+          DefaultVectorVector<T, T, Op>(op));
+    } else {
+      // opvs was UseDefaultBinaryOp; opvv is a real functor in this branch
+      binary_op<T, T>(
+          a, b, out, op, opsv, DefaultVectorScalar<T, T, Op>(op), opvv);
+    }
+  } else if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::
+                           value) {
+    // opvv was UseDefaultBinaryOp
+    binary_op<T, T>(
+        a, b, out, op, opsv, opvs, DefaultVectorVector<T, T, Op>(op));
+  } else {
+    // All ops provided
+    binary_op<T, T>(a, b, out, op, opsv, opvs, opvv);
+  }
+}
+
+template <typename T, typename Op>
+void binary_op(const array& a, const array& b, array& out, Op op) {
+  DefaultScalarVector<T, T, Op> opsv(op);
+  DefaultVectorScalar<T, T, Op> opvs(op);
+  DefaultVectorVector<T, T, Op> opvv(op);
+  binary_op<T, T>(a, b, out, op, opsv, opvs, opvv); // output element type equals the input type T; all three bulk paths use the Default* loops
+}
+
+template <typename... Ops>
+void binary(const array& a, const array& b, array& out, Ops... ops) {
+  switch (out.dtype()) { // dispatch on the output dtype; NOTE(review): no default case, so an unlisted dtype does nothing
+    case bool_:
+      binary_op<bool>(a, b, out, ops...);
+      break;
+    case uint8:
+      binary_op<uint8_t>(a, b, out, ops...);
+      break;
+    case uint16:
+      binary_op<uint16_t>(a, b, out, ops...);
+      break;
+    case uint32:
+      binary_op<uint32_t>(a, b, out, ops...);
+      break;
+    case uint64:
+      binary_op<uint64_t>(a, b, out, ops...);
+      break;
+    case int8:
+      binary_op<int8_t>(a, b, out, ops...);
+      break;
+    case int16:
+      binary_op<int16_t>(a, b, out, ops...);
+      break;
+    case int32:
+      binary_op<int32_t>(a, b, out, ops...);
+      break;
+    case int64:
+      binary_op<int64_t>(a, b, out, ops...);
+      break;
+    case float16:
+      binary_op<float16_t>(a, b, out, ops...);
+      break;
+    case float32:
+      binary_op<float>(a, b, out, ops...);
+      break;
+    case bfloat16:
+      binary_op<bfloat16_t>(a, b, out, ops...);
+      break;
+    case complex64:
+      binary_op<complex64_t>(a, b, out, ops...);
+      break;
+  }
+}
+
+} // namespace
+
 } // namespace mlx::core
--- a/mlx/backend/common/binary_two.h
+++ b/mlx/backend/common/binary_two.h
@@ -2,8 +2,8 @@

 #pragma once

+#include "mlx/backend/common/binary.h"
 #include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/binary.h"

 namespace mlx::core {

--- a/mlx/backend/common/cholesky.cpp
+++ b/mlx/backend/common/cholesky.cpp
@@ -1,8 +1,8 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/allocator.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/lapack.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/linalg.h"
 #include "mlx/primitives.h"

@@ -64,7 +64,7 @@ void cholesky_impl(const array& a, array& factor, bool upper) {
  }
 }

-void Cholesky::eval_cpu(const std::vector<array>& inputs, array& output) {
+void Cholesky::eval(const std::vector<array>& inputs, array& output) {
  if (inputs[0].dtype() != float32) {
    throw std::runtime_error("[Cholesky::eval] only supports float32.");
  }
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -42,7 +42,9 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {
  return move_or_copy(in, out, strides_, flags, data_size, offset_);
 }

-void broadcast(const array& in, array& out) {
+void Broadcast::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
  if (out.size() == 0) {
    out.set_data(nullptr);
    return;
@@ -59,14 +61,6 @@ void broadcast(const array& in, array& out) {
  move_or_copy(in, out, strides, flags, in.data_size());
 }

-void Broadcast::eval(const std::vector<array>& inputs, array& out) {
-  broadcast(inputs[0], out);
-}
-
-void BroadcastAxes::eval(const std::vector<array>& inputs, array& out) {
-  broadcast(inputs[0], out);
-}
-
 void Copy::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  move_or_copy(inputs[0], out);
--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -130,7 +130,7 @@ std::string build_lib_name(

 bool compiled_check_contiguity(
    const std::vector<array>& inputs,
-    const Shape& shape) {
+    const std::vector<int>& shape) {
  bool contiguous = true;
  bool all_contig = true;
  bool all_row_contig = true;
--- a/mlx/backend/common/compiled.h
+++ b/mlx/backend/common/compiled.h
@@ -11,7 +11,9 @@
 namespace mlx::core {

 inline bool is_static_cast(const Primitive& p) {
-  return (typeid(p) == typeid(Broadcast) || typeid(p) == typeid(AsType));
+  return (
+      typeid(p) == typeid(Broadcast) || typeid(p) == typeid(Copy) ||
+      typeid(p) == typeid(StopGradient) || typeid(p) == typeid(AsType));
 }

 std::string build_lib_name(
@@ -54,7 +56,7 @@ inline bool is_scalar(const array& x) {
 // Check if we can use a contiguous operation given inputs and the output shape
 bool compiled_check_contiguity(
    const std::vector<array>& inputs,
-    const Shape& shape);
+    const std::vector<int>& shape);

 // Allocate space for the outputs possibly with input donation
 void compiled_allocate_outputs(
--- a/mlx/backend/common/compiled_cpu.cpp
+++ b/mlx/backend/common/compiled_cpu.cpp
@@ -7,11 +7,8 @@
 #include <mutex>
 #include <shared_mutex>

-#include <fmt/format.h>
-
 #include "mlx/backend/common/compiled.h"
-#include "mlx/backend/cpu/compiled_preamble.h"
-#include "mlx/backend/cpu/jit_compiler.h"
+#include "mlx/backend/common/compiled_preamble.h"
 #include "mlx/device.h"
 #include "mlx/graph_utils.h"

@@ -47,9 +44,12 @@ namespace detail {
 bool compile_available_for_device(const Device& device) {
  return true;
 }
-
 } // namespace detail

+std::string get_temp_file(const std::string& name) {
+  return std::filesystem::temp_directory_path().append(name).string(); // path of `name` inside the system temp directory
+}
+
 // Return a pointer to a compiled function
 void* compile(
    const std::string& kernel_name,
@@ -68,30 +68,24 @@ void* compile(
  std::string source_code = source_builder();
  std::string kernel_file_name;

-  // Deal with long kernel names. Maximum length for filename on macOS is 255
-  // characters, and on Windows the maximum length for whole path is 260. Clip
-  // file name with a little extra room and append a 16 character hash.
-#ifdef _WIN32
-  constexpr int max_file_name_length = 140;
-#else
+  // Deal with long kernel names. Maximum length for files on macOS is 255
+  // characters. Clip file name with a little extra room and append a 16
+  // character hash.
  constexpr int max_file_name_length = 245;
-#endif
  if (kernel_name.size() > max_file_name_length) {
    std::ostringstream file_name;
    file_name
        << std::string_view(kernel_name).substr(0, max_file_name_length - 16);
-    auto file_id =
-        std::hash<std::string>{}(kernel_name.substr(max_file_name_length - 16));
+    auto file_id = std::hash<std::string>{}(kernel_name);
    file_name << "_" << std::hex << std::setw(16) << file_id << std::dec;
    kernel_file_name = file_name.str();
  } else {
    kernel_file_name = kernel_name;
  }

-  auto output_dir = std::filesystem::temp_directory_path();
-
-  std::string shared_lib_name = "lib" + kernel_file_name + ".so";
-  auto shared_lib_path = (output_dir / shared_lib_name).string();
+  std::ostringstream shared_lib_name;
+  shared_lib_name << "lib" << kernel_file_name << ".so";
+  auto shared_lib_path = get_temp_file(shared_lib_name.str());
  bool lib_exists = false;
  {
    std::ifstream f(shared_lib_path.c_str());
@@ -100,21 +94,24 @@ void* compile(

  if (!lib_exists) {
    // Open source file and write source code to it
-    std::string source_file_name = kernel_file_name + ".cpp";
-    auto source_file_path = (output_dir / source_file_name).string();
+    std::ostringstream source_file_name;
+    source_file_name << kernel_file_name << ".cpp";
+    auto source_file_path = get_temp_file(source_file_name.str());

    std::ofstream source_file(source_file_path);
    source_file << source_code;
    source_file.close();

-    try {
-      JitCompiler::exec(JitCompiler::build_command(
-          output_dir, source_file_name, shared_lib_name));
-    } catch (const std::exception& error) {
-      throw std::runtime_error(fmt::format(
-          "[Compile::eval_cpu] Failed to compile function {0}: {1}",
-          kernel_name,
-          error.what()));
+    std::ostringstream build_command;
+    build_command << "g++ -std=c++17 -O3 -Wall -fPIC -shared '"
+                  << source_file_path << "' -o '" << shared_lib_path << "'";
+    std::string build_command_str = build_command.str();
+    auto return_code = system(build_command_str.c_str());
+    if (return_code) {
+      std::ostringstream msg;
+      msg << "[Compile::eval_cpu] Failed to compile function " << kernel_name
+          << " with error code " << return_code << "." << std::endl;
+      throw std::runtime_error(msg.str());
    }
  }

@@ -154,11 +151,6 @@ inline void build_kernel(

  NodeNamer namer;

-#ifdef _MSC_VER
-  // Export the symbol
-  os << "__declspec(dllexport) ";
-#endif
-
  // Start the kernel
  os << "void " << kernel_name << "(void** args) {" << std::endl;

--- a/mlx/backend/common/compiled_nocpu.cpp
+++ b/mlx/backend/common/compiled_nocpu.cpp
@@ -1,7 +1,6 @@
 // Copyright © 2023-2024 Apple Inc.

-#include "mlx/compile_impl.h"
-#include "mlx/primitives.h"
+#include "mlx/backend/common/compiled.h"

 namespace mlx::core {

--- a/mlx/backend/common/compiled_preamble.h
+++ b/mlx/backend/common/compiled_preamble.h
@@ -5,8 +5,7 @@
 // clang-format off
 #include "mlx/types/half_types.h"
 #include "mlx/types/complex.h"
-#include "mlx/backend/cpu/unary_ops.h"
-#include "mlx/backend/cpu/binary_ops.h"
+#include "mlx/backend/common/ops.h"
 // clang-format on

 const char* get_kernel_preamble();
--- a/mlx/backend/common/conv.cpp
+++ b/mlx/backend/common/conv.cpp
@@ -3,8 +3,8 @@
 #include <cassert>
 #include <numeric>

-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/lapack.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/primitives.h"
 #include "mlx/utils.h"

@@ -726,7 +726,7 @@ void explicit_gemm_conv_1D_cpu(
  auto conv_dtype = float32;

  // Pad input
-  Shape padded_shape = {N, iH + 2 * padding[0], C};
+  std::vector<int> padded_shape = {N, iH + 2 * padding[0], C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
@@ -765,7 +765,7 @@ void explicit_gemm_conv_1D_cpu(
      in_padded, strided_strides, flags, in_strided_view.size(), 0);

  // Materialize strided view
-  Shape strided_reshape = {N * oH, wH * C};
+  std::vector<int> strided_reshape = {N * oH, wH * C};
  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
  copy(in_strided_view, in_strided, CopyType::General);

@@ -843,7 +843,8 @@ void explicit_gemm_conv_2D_cpu(
  auto conv_dtype = out.dtype();

  // Pad input
-  Shape padded_shape = {N, iH + 2 * padding[0], iW + 2 * padding[1], C};
+  std::vector<int> padded_shape = {
+      N, iH + 2 * padding[0], iW + 2 * padding[1], C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
@@ -880,7 +881,7 @@ void explicit_gemm_conv_2D_cpu(
      in_padded, strided_strides, flags, in_strided_view.size(), 0);

  // Materialize strided view
-  Shape strided_reshape = {N * oH * oW, wH * wW * C};
+  std::vector<int> strided_reshape = {N * oH * oW, wH * wW * C};
  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
  copy(in_strided_view, in_strided, CopyType::General);

@@ -933,19 +934,19 @@ void explicit_gemm_conv_ND_cpu(
    const std::vector<int>& wt_dilation,
    const bool flip) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
-  const auto iDim =
-      Shape(in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
-  const auto oDim = Shape(
+  const auto iDim = std::vector<int>(
+      in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
+  const auto oDim = std::vector<int>(
      out.shape().begin() + 1, out.shape().end() - 1); // Output spatial dim
  const int O = wt.shape(0); // Out channels
  const int C = wt.shape(-1); // In channels
-  const auto wDim =
-      Shape(wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim
+  const auto wDim = std::vector<int>(
+      wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim

  auto conv_dtype = float32;

  // Pad input
-  Shape padded_shape(in.shape().size());
+  std::vector<int> padded_shape(in.shape().size());
  padded_shape.front() = N;
  for (size_t i = 0; i < iDim.size(); i++) {
    padded_shape[i + 1] = iDim[i] + 2 * padding[i];
@@ -1128,7 +1129,7 @@ void conv_3D_cpu(

 } // namespace

-void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Convolution::eval(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc_or_wait(out.nbytes()));

  auto& in = inputs[0];
--- a/mlx/backend/common/copy.cpp
+++ b/mlx/backend/common/copy.cpp
@@ -3,9 +3,8 @@
 #include <numeric>

 #include "mlx/allocator.h"
+#include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/simd/simd.h"

 namespace mlx::core {

@@ -24,7 +23,6 @@ template <typename SrcT, typename DstT>
 void copy_vector(const array& src, array& dst) {
  auto src_ptr = src.data<SrcT>();
  auto dst_ptr = dst.data<DstT>();
-  size_t size = src.data_size();
  std::copy(src_ptr, src_ptr + src.data_size(), dst_ptr);
 }

--- a/mlx/backend/common/copy.h
+++ b/mlx/backend/common/copy.h
@@ -3,6 +3,7 @@
 #pragma once

 #include "mlx/array.h"
+#include "mlx/backend/common/utils.h"

 namespace mlx::core {

@@ -22,4 +23,17 @@ enum class CopyType {
  GeneralGeneral
 };

+void copy(const array& src, array& dst, CopyType ctype);
+void copy_inplace(const array& src, array& dst, CopyType ctype);
+
+void copy_inplace(
+    const array& src,
+    array& dst,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
+    int64_t i_offset,
+    int64_t o_offset,
+    CopyType ctype);
+
 } // namespace mlx::core
--- a/mlx/backend/common/default_primitives.cpp
+++ b/mlx/backend/common/default_primitives.cpp
@@ -0,0 +1,197 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <cstring>
+
+#include "mlx/array.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
+#include "mlx/backend/common/utils.h"
+#include "mlx/primitives.h"
+
+// Defines `primitive::eval_cpu` for a single-output primitive as a thin
+// forwarder to `primitive::eval`, passing the inputs and output through
+// unchanged.
+#define DEFAULT(primitive)                                                 \
+  void primitive::eval_cpu(const std::vector<array>& inputs, array& out) { \
+    primitive::eval(inputs, out);                                          \
+  }
+
+// Same as DEFAULT, but for primitives whose eval takes a vector of outputs.
+#define DEFAULT_MULTI(primitive)                                       \
+  void primitive::eval_cpu(                                            \
+      const std::vector<array>& inputs, std::vector<array>& outputs) { \
+    primitive::eval(inputs, outputs);                                  \
+  }
+
+namespace mlx::core {
+
+DEFAULT(Abs)
+DEFAULT(Add)
+DEFAULT(Arange)
+DEFAULT(ArcCos)
+DEFAULT(ArcCosh)
+DEFAULT(ArcSin)
+DEFAULT(ArcSinh)
+DEFAULT(ArcTan)
+DEFAULT(ArcTan2)
+DEFAULT(ArcTanh)
+DEFAULT(ArgPartition)
+DEFAULT(ArgReduce)
+DEFAULT(ArgSort)
+DEFAULT(AsType)
+DEFAULT(AsStrided)
+DEFAULT(Broadcast)
+DEFAULT(BlockMaskedMM)
+DEFAULT(GatherMM)
+DEFAULT(GatherQMM)
+DEFAULT_MULTI(DivMod)
+DEFAULT(Ceil)
+DEFAULT(Concatenate)
+DEFAULT(Conjugate)
+DEFAULT(Convolution)
+DEFAULT(Copy)
+DEFAULT(Cos)
+DEFAULT(Cosh)
+DEFAULT_MULTI(CustomTransforms)
+DEFAULT_MULTI(Depends)
+DEFAULT(Divide)
+DEFAULT(NumberOfElements)
+DEFAULT(Remainder)
+DEFAULT(Equal)
+DEFAULT(Erf)
+DEFAULT(ErfInv)
+DEFAULT(Exp)
+DEFAULT(ExpandDims)
+DEFAULT(Expm1)
+DEFAULT(FFT)
+DEFAULT(Floor)
+DEFAULT(Full)
+DEFAULT(Gather)
+DEFAULT(Greater)
+DEFAULT(GreaterEqual)
+DEFAULT(Hadamard)
+DEFAULT(Less)
+DEFAULT(LessEqual)
+DEFAULT(Load)
+DEFAULT(Log)
+DEFAULT(Log1p)
+DEFAULT(LogicalNot)
+DEFAULT(LogicalAnd)
+DEFAULT(LogicalOr)
+DEFAULT(LogAddExp)
+DEFAULT(Maximum)
+DEFAULT(Minimum)
+DEFAULT(Multiply)
+DEFAULT(Negative)
+DEFAULT(NotEqual)
+DEFAULT(Pad)
+DEFAULT(Partition)
+DEFAULT(Power)
+DEFAULT_MULTI(QRF)
+DEFAULT(QuantizedMatmul)
+DEFAULT(RandomBits)
+DEFAULT(Reduce)
+DEFAULT(Round)
+DEFAULT(Scan)
+DEFAULT(Scatter)
+DEFAULT(Select)
+DEFAULT(Sigmoid)
+DEFAULT(Sign)
+DEFAULT(Sin)
+DEFAULT(Sinh)
+DEFAULT(Slice)
+DEFAULT(SliceUpdate)
+DEFAULT(Softmax)
+DEFAULT(Sort)
+DEFAULT_MULTI(Split)
+DEFAULT(Square)
+DEFAULT(Squeeze)
+DEFAULT(Sqrt)
+DEFAULT(StopGradient)
+DEFAULT(Subtract)
+DEFAULT_MULTI(SVD)
+DEFAULT(Tan)
+DEFAULT(Tanh)
+DEFAULT(Transpose)
+DEFAULT(Inverse)
+DEFAULT(Cholesky)
+DEFAULT_MULTI(Eigh)
+
+namespace {
+
+// Batched float32 matrix multiply built on cblas_sgemm.
+//
+// For every batch matrix, computes out = alpha * (a @ b) + beta * out, where
+// the last two dimensions of `a_pre` / `b_pre` are the matrix dimensions.
+// `out` must already have its data set by the caller; it is written as
+// consecutive M x N matrices with leading dimension out.shape(-1).
+//
+// NOTE(review): when K == 0 the output is zero-filled regardless of `beta`.
+// AddMM calls this after copying C into `out`, so that path discards C
+// instead of leaving beta * C — confirm this is intended.
+inline void matmul_common_general(
+    const array& a_pre,
+    const array& b_pre,
+    array& out,
+    float alpha = 1.0f,
+    float beta = 0.0f) {
+  // Inspects the strides of the last two axes and returns a tuple of
+  // (is_transposed, leading_dimension, array_to_use). If the layout is
+  // neither row-major nor its transpose, the array is first copied into a
+  // freshly created array with a general copy.
+  auto check_transpose = [](const array& arr) {
+    auto stx = arr.strides()[arr.ndim() - 2];
+    auto sty = arr.strides()[arr.ndim() - 1];
+    if (stx == arr.shape(-1) && sty == 1) {
+      // Rows are contiguous: usable as-is, not transposed.
+      return std::make_tuple(false, stx, arr);
+    } else if (stx == 1 && sty == arr.shape(-2)) {
+      // Columns are contiguous: usable as a transposed operand.
+      return std::make_tuple(true, sty, arr);
+    } else {
+      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+      copy(arr, arr_copy, CopyType::General);
+      stx = arr.shape(-1);
+      return std::make_tuple(false, stx, arr_copy);
+    }
+  };
+
+  auto [a_transposed, lda, a] = check_transpose(a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(b_pre);
+  size_t M = a.shape(-2);
+  size_t N = b.shape(-1);
+  size_t K = a.shape(-1);
+  // Nothing to write when the output matrices are empty.
+  if (M == 0 || N == 0) {
+    return;
+  }
+  // Empty inner dimension: skip BLAS and zero-fill the output buffer.
+  if (K == 0) {
+    std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
+    return;
+  }
+
+  // One sgemm call per batch matrix; the batch count is derived from `a`.
+  // Input offsets are resolved through elem_to_loc using each operand's
+  // shape and strides, while the output is indexed as packed M * N blocks.
+  for (int i = 0; i < (a.size() / (M * K)); ++i) {
+    cblas_sgemm(
+        CblasRowMajor,
+        a_transposed ? CblasTrans : CblasNoTrans, // transA
+        b_transposed ? CblasTrans : CblasNoTrans, // transB
+        M,
+        N,
+        K,
+        alpha, // alpha
+        a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
+        lda,
+        b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
+        ldb,
+        beta, // beta
+        out.data<float>() + M * N * i,
+        out.shape(-1) // ldc
+    );
+  }
+}
+
+} // namespace
+
+// CPU matmul fallback: rejects any output dtype other than float32,
+// allocates the output buffer, then multiplies inputs[0] by inputs[1] with
+// matmul_common_general using its default alpha / beta.
+void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
+  if (out.dtype() != float32) {
+    throw std::runtime_error(
+        "[Matmul::eval_cpu] Currently only supports float32.");
+  }
+
+  const array& lhs = inputs[0];
+  const array& rhs = inputs[1];
+
+  // The output has no storage yet; give it one before BLAS writes into it.
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  matmul_common_general(lhs, rhs, out);
+}
+
+// CPU addmm fallback: rejects any output dtype other than float32, seeds the
+// output with C (inputs[2]) and then accumulates the product of inputs[0]
+// and inputs[1] into it using this primitive's alpha_ and beta_.
+void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
+  if (out.dtype() != float32) {
+    throw std::runtime_error(
+        "[AddMM::eval_cpu] Currently only supports float32.");
+  }
+
+  const array& lhs = inputs[0];
+  const array& rhs = inputs[1];
+  const array& addend = inputs[2];
+
+  // A single stored element is copied as a scalar; anything else goes
+  // through the general copy path.
+  CopyType copy_kind;
+  if (addend.data_size() == 1) {
+    copy_kind = CopyType::Scalar;
+  } else {
+    copy_kind = CopyType::General;
+  }
+  copy(addend, out, copy_kind);
+
+  matmul_common_general(lhs, rhs, out, alpha_, beta_);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/eigh.cpp
+++ b/mlx/backend/common/eigh.cpp
@@ -2,8 +2,8 @@

 #include "mlx/allocator.h"
 #include "mlx/array.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/lapack.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/linalg.h"
 #include "mlx/primitives.h"

@@ -45,9 +45,7 @@ void ssyevd(

 } // namespace

-void Eigh::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
+void Eigh::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
  const auto& a = inputs[0];
  auto& values = outputs[0];

--- a/mlx/backend/common/erf.cpp
+++ b/mlx/backend/common/erf.cpp
@@ -0,0 +1,40 @@
+// Copyright © 2023 Apple Inc.
+
+#include <cmath>
+
+namespace mlx::core {
+
+/* Approximation to the inverse error function.
+ * Based on code from:
+ *   https://stackoverflow.com/questions/27229371/inverse-error-function-in-c#answer-49743348
+ */
+float erfinv(float a) {
+  // Polynomial coefficients, highest order first. Both polynomials are in
+  // the variable log(1 - a * a) and are evaluated with Horner's scheme.
+  constexpr int num_tail = 9;
+  constexpr float tail_coeffs[num_tail] = {
+      3.03697567e-10f, //  0x1.4deb44p-32
+      2.93243101e-8f, //  0x1.f7c9aep-26
+      1.22150334e-6f, //  0x1.47e512p-20
+      2.84108955e-5f, //  0x1.dca7dep-16
+      3.93552968e-4f, //  0x1.9cab92p-12
+      3.02698812e-3f, //  0x1.8cc0dep-9
+      4.83185798e-3f, //  0x1.3ca920p-8
+      -2.64646143e-1f, // -0x1.0eff66p-2
+      8.40016484e-1f, //  0x1.ae16a4p-1
+  };
+  constexpr int num_central = 10;
+  constexpr float central_coeffs[num_central] = {
+      5.43877832e-9f, //  0x1.75c000p-28
+      1.43285448e-7f, //  0x1.33b402p-23
+      1.22774793e-6f, //  0x1.499232p-20
+      1.12963626e-7f, //  0x1.e52cd2p-24
+      -5.61530760e-5f, // -0x1.d70bd0p-15
+      -1.47697632e-4f, // -0x1.35be90p-13
+      2.31468678e-3f, //  0x1.2f6400p-9
+      1.15392581e-2f, //  0x1.7a1e50p-7
+      -2.32015476e-1f, // -0x1.db2aeep-3
+      8.86226892e-1f, //  0x1.c5bf88p-1
+  };
+
+  // 1 - a * a is formed with a single fused multiply-add before the log.
+  const float log_term = std::log(std::fma(a, 0.0f - a, 1.0f));
+
+  // |log_term| > 6.125 selects the tail polynomial (maximum ulp error =
+  // 2.35793); everything else, including NaN, uses the central polynomial
+  // (maximum ulp error = 2.35002).
+  const bool use_tail = std::abs(log_term) > 6.125f;
+  const float* coeffs = use_tail ? tail_coeffs : central_coeffs;
+  const int count = use_tail ? num_tail : num_central;
+
+  float poly = coeffs[0];
+  for (int k = 1; k < count; ++k) {
+    poly = std::fma(poly, log_term, coeffs[k]);
+  }
+  return a * poly;
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/fft.cpp
+++ b/mlx/backend/common/fft.cpp
@@ -8,7 +8,7 @@

 namespace mlx::core {

-void FFT::eval_cpu(const std::vector<array>& inputs, array& out) {
+void FFT::eval(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];
  std::vector<std::ptrdiff_t> strides_in(
      in.strides().begin(), in.strides().end());
--- a/mlx/backend/common/hadamard.cpp
+++ b/mlx/backend/common/hadamard.cpp
@@ -2,8 +2,8 @@

 #include <cassert>

+#include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/hadamard.h"
-#include "mlx/backend/cpu/copy.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
@@ -82,7 +82,7 @@ void hadamard(array& out, int n, int m, float scale) {
  }
 }

-void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Hadamard::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

@@ -104,4 +104,4 @@ void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-} // namespace mlx::core
+} // namespace mlx::core
--- a/mlx/backend/common/indexing.cpp
+++ b/mlx/backend/common/indexing.cpp
@@ -6,8 +6,8 @@
 #include "mlx/allocator.h"
 #include "mlx/primitives.h"

+#include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/copy.h"

 namespace mlx::core {

@@ -16,6 +16,11 @@ inline size_t offset_neg_idx(IdxT idx, size_t size) {
  return (idx < 0) ? idx + size : idx;
 }

+// Specialization for boolean indices: the index is returned as-is and the
+// size argument is ignored, so no negative-index wrap-around is applied.
+template <>
+inline size_t offset_neg_idx(bool idx, size_t) {
+  return idx;
+}
+
 template <>
 inline size_t offset_neg_idx(uint32_t idx, size_t) {
  return idx;
@@ -157,18 +162,21 @@ void dispatch_gather(
  }
 }

-void Gather::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Gather::eval(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc_or_wait(out.nbytes()));

  auto& src = inputs[0];
  std::vector<array> inds(inputs.begin() + 1, inputs.end());

  if (inds.empty()) {
-    dispatch_gather<uint8_t>(src, inds, out, axes_, slice_sizes_);
+    dispatch_gather<bool>(src, inds, out, axes_, slice_sizes_);
    return;
  }

  switch (inds[0].dtype()) {
+    case bool_:
+      dispatch_gather<bool>(src, inds, out, axes_, slice_sizes_);
+      break;
    case uint8:
      dispatch_gather<uint8_t>(src, inds, out, axes_, slice_sizes_);
      break;
@@ -193,142 +201,12 @@ void Gather::eval_cpu(const std::vector<array>& inputs, array& out) {
    case int64:
      dispatch_gather<int64_t>(src, inds, out, axes_, slice_sizes_);
      break;
-    default:
-      throw std::runtime_error(
-          "[Gather::eval_cpu] Cannot gather with indices type.");
-      break;
-  }
-}
-template <typename T, typename IdxT>
-void gather_axis(
-    const array& src,
-    const array& ind,
-    array& out,
-    const int axis) {
-  auto strides = ind.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = ind.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator ind_it(shape, strides, src.ndim() - 1);
-
-  strides = src.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator src_it(shape, strides, src.ndim() - 1);
-
-  auto ind_ptr = ind.data<IdxT>();
-  auto src_ptr = src.data<T>();
-  auto dst_ptr = out.data<T>();
-  auto ind_ax_stride = ind.strides(axis);
-  auto src_ax_stride = src.strides(axis);
-  auto dst_ax_stride = out.strides(axis);
-  auto ind_ax_size = ind.shape(axis);
-  auto src_ax_size = src.shape(axis);
-
-  size_t size_pre = 1;
-  size_t size_post = 1;
-  for (int i = 0; i < axis; ++i) {
-    size_pre *= ind.shape(i);
-  }
-  for (int i = axis + 1; i < ind.ndim(); ++i) {
-    size_post *= ind.shape(i);
-  }
-  size_t stride_pre = size_post * ind_ax_size;
-  for (size_t i = 0; i < size_pre; i++) {
-    for (size_t k = 0; k < size_post; k++) {
-      for (int j = 0; j < ind_ax_size; ++j) {
-        auto ind_val = offset_neg_idx(
-            ind_ptr[ind_it.loc + j * ind_ax_stride], src_ax_size);
-        dst_ptr[k + j * dst_ax_stride] =
-            src_ptr[src_it.loc + ind_val * src_ax_stride];
-      }
-      ind_it.step();
-      src_it.step();
-    }
-    dst_ptr += stride_pre;
-  }
-}
-
-template <typename IdxT>
-void dispatch_gather_axis(
-    const array& src,
-    const array& inds,
-    array& out,
-    const int axis) {
-  switch (out.dtype()) {
-    case bool_:
-      gather_axis<bool, IdxT>(src, inds, out, axis);
-      break;
-    case uint8:
-      gather_axis<uint8_t, IdxT>(src, inds, out, axis);
-      break;
-    case uint16:
-      gather_axis<uint16_t, IdxT>(src, inds, out, axis);
-      break;
-    case uint32:
-      gather_axis<uint32_t, IdxT>(src, inds, out, axis);
-      break;
-    case uint64:
-      gather_axis<uint64_t, IdxT>(src, inds, out, axis);
-      break;
-    case int8:
-      gather_axis<int8_t, IdxT>(src, inds, out, axis);
-      break;
-    case int16:
-      gather_axis<int16_t, IdxT>(src, inds, out, axis);
-      break;
-    case int32:
-      gather_axis<int32_t, IdxT>(src, inds, out, axis);
-      break;
-    case int64:
-      gather_axis<int64_t, IdxT>(src, inds, out, axis);
-      break;
    case float16:
-      gather_axis<float16_t, IdxT>(src, inds, out, axis);
-      break;
    case float32:
-      gather_axis<float, IdxT>(src, inds, out, axis);
-      break;
    case bfloat16:
-      gather_axis<bfloat16_t, IdxT>(src, inds, out, axis);
-      break;
    case complex64:
-      gather_axis<complex64_t, IdxT>(src, inds, out, axis);
-      break;
-  }
-}
-
-void GatherAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-  auto& src = inputs[0];
-  auto& inds = inputs[1];
-  switch (inds.dtype()) {
-    case uint8:
-      dispatch_gather_axis<uint8_t>(src, inds, out, axis_);
-      break;
-    case uint16:
-      dispatch_gather_axis<uint16_t>(src, inds, out, axis_);
-      break;
-    case uint32:
-      dispatch_gather_axis<uint32_t>(src, inds, out, axis_);
-      break;
-    case uint64:
-      dispatch_gather_axis<uint64_t>(src, inds, out, axis_);
-      break;
-    case int8:
-      dispatch_gather_axis<int8_t>(src, inds, out, axis_);
-      break;
-    case int16:
-      dispatch_gather_axis<int16_t>(src, inds, out, axis_);
-      break;
-    case int32:
-      dispatch_gather_axis<int32_t>(src, inds, out, axis_);
-      break;
-    case int64:
-      dispatch_gather_axis<int64_t>(src, inds, out, axis_);
-      break;
-    default:
      throw std::runtime_error(
-          "[GatherAxis::eval_cpu] Cannot gather with indices type.");
+          "[Gather::eval] Cannot gather with floating point indices.");
      break;
  }
 }
@@ -418,11 +296,14 @@ void dispatch_scatter(
    const std::vector<int>& axes,
    Scatter::ReduceType rtype) {
  if (inds.empty()) {
-    dispatch_scatter_inds<InT, uint8_t>(out, inds, updates, axes, rtype);
+    dispatch_scatter_inds<InT, bool>(out, inds, updates, axes, rtype);
    return;
  }

  switch (inds[0].dtype()) {
+    case bool_:
+      dispatch_scatter_inds<InT, bool>(out, inds, updates, axes, rtype);
+      break;
    case uint8:
      dispatch_scatter_inds<InT, uint8_t>(out, inds, updates, axes, rtype);
      break;
@@ -447,13 +328,16 @@ void dispatch_scatter(
    case int64:
      dispatch_scatter_inds<InT, int64_t>(out, inds, updates, axes, rtype);
      break;
-    default:
+    case float16:
+    case float32:
+    case bfloat16:
+    case complex64:
      throw std::runtime_error(
-          "[Scatter::eval_cpu] Cannot scatter with indices type.");
+          "[Scatter::eval_cpu] Cannot scatter with floating point indices.");
  }
 }

-void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Scatter::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() >= 2);

  auto& src = inputs[0];
@@ -461,9 +345,7 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& updates = inputs.back();

  // Copy src into out (copy allocates memory for out)
-  auto ctype =
-      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy(src, out, ctype);
+  copy(src, out, CopyType::General);

  switch (src.dtype()) {
    case bool_:
@@ -508,167 +390,4 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-template <typename T, typename IdxT, typename OpT>
-void scatter_axis(
-    array& out,
-    const array idx,
-    const array& upd,
-    int axis,
-    const OpT& op) {
-  auto strides = idx.strides();
-  strides.erase(strides.begin() + axis);
-  auto shape = idx.shape();
-  shape.erase(shape.begin() + axis);
-  ContiguousIterator idx_it(shape, strides, upd.ndim() - 1);
-
-  strides = upd.strides();
-  strides.erase(strides.begin() + axis);
-  ContiguousIterator upd_it(shape, strides, upd.ndim() - 1);
-
-  auto idx_ptr = idx.data<IdxT>();
-  auto upd_ptr = upd.data<T>();
-  auto dst_ptr = out.data<T>();
-  auto idx_ax_stride = idx.strides(axis);
-  auto upd_ax_stride = upd.strides(axis);
-  auto dst_ax_stride = out.strides(axis);
-  auto idx_ax_size = idx.shape(axis);
-  auto dst_ax_size = out.shape(axis);
-
-  size_t size_pre = 1;
-  size_t size_post = 1;
-  for (int i = 0; i < axis; ++i) {
-    size_pre *= idx.shape(i);
-  }
-  for (int i = axis + 1; i < idx.ndim(); ++i) {
-    size_post *= idx.shape(i);
-  }
-  size_t stride_pre = size_post * dst_ax_size;
-  for (size_t i = 0; i < size_pre; i++) {
-    for (size_t k = 0; k < size_post; k++) {
-      for (int j = 0; j < idx_ax_size; ++j) {
-        auto ind_val = offset_neg_idx(
-            idx_ptr[idx_it.loc + j * idx_ax_stride], dst_ax_size);
-        op(upd_ptr[upd_it.loc + j * upd_ax_stride],
-           dst_ptr + k + ind_val * dst_ax_stride);
-      }
-      idx_it.step();
-      upd_it.step();
-    }
-    dst_ptr += stride_pre;
-  }
-}
-
-template <typename InT, typename IdxT>
-void dispatch_scatter_axis_op(
-    array& out,
-    const array& idx,
-    const array& updates,
-    int axis,
-    ScatterAxis::ReduceType rtype) {
-  switch (rtype) {
-    case ScatterAxis::None:
-      scatter_axis<InT, IdxT>(
-          out, idx, updates, axis, [](auto x, auto* y) { (*y) = x; });
-      break;
-    case ScatterAxis::Sum:
-      scatter_axis<InT, IdxT>(
-          out, idx, updates, axis, [](auto x, auto* y) { (*y) += x; });
-      break;
-  }
-}
-
-template <typename InT>
-void dispatch_scatter_axis(
-    array& out,
-    const array& idx,
-    const array& updates,
-    int axis,
-    ScatterAxis::ReduceType rtype) {
-  switch (idx.dtype()) {
-    case uint8:
-      dispatch_scatter_axis_op<InT, uint8_t>(out, idx, updates, axis, rtype);
-      break;
-    case uint16:
-      dispatch_scatter_axis_op<InT, uint16_t>(out, idx, updates, axis, rtype);
-      break;
-    case uint32:
-      dispatch_scatter_axis_op<InT, uint32_t>(out, idx, updates, axis, rtype);
-      break;
-    case uint64:
-      dispatch_scatter_axis_op<InT, uint64_t>(out, idx, updates, axis, rtype);
-      break;
-    case int8:
-      dispatch_scatter_axis_op<InT, int8_t>(out, idx, updates, axis, rtype);
-      break;
-    case int16:
-      dispatch_scatter_axis_op<InT, int16_t>(out, idx, updates, axis, rtype);
-      break;
-    case int32:
-      dispatch_scatter_axis_op<InT, int32_t>(out, idx, updates, axis, rtype);
-      break;
-    case int64:
-      dispatch_scatter_axis_op<InT, int64_t>(out, idx, updates, axis, rtype);
-      break;
-    default:
-      throw std::runtime_error(
-          "[ScatterAxis::eval_cpu] Cannot scatter with indices type.");
-  }
-}
-
-void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() >= 2);
-
-  auto& src = inputs[0];
-  auto& idx = inputs[1];
-  auto& updates = inputs[2];
-
-  // Copy src into out (copy allocates memory for out)
-  auto ctype =
-      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
-  copy(src, out, ctype);
-
-  switch (src.dtype()) {
-    case bool_:
-      dispatch_scatter_axis<bool>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case uint8:
-      dispatch_scatter_axis<uint8_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case uint16:
-      dispatch_scatter_axis<uint16_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case uint32:
-      dispatch_scatter_axis<uint32_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case uint64:
-      dispatch_scatter_axis<uint64_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case int8:
-      dispatch_scatter_axis<int8_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case int16:
-      dispatch_scatter_axis<int16_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case int32:
-      dispatch_scatter_axis<int32_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case int64:
-      dispatch_scatter_axis<int64_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case float16:
-      dispatch_scatter_axis<float16_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case float32:
-      dispatch_scatter_axis<float>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case bfloat16:
-      dispatch_scatter_axis<bfloat16_t>(out, idx, updates, axis_, reduce_type_);
-      break;
-    case complex64:
-      dispatch_scatter_axis<complex64_t>(
-          out, idx, updates, axis_, reduce_type_);
-      break;
-  }
-}
-
 } // namespace mlx::core
--- a/mlx/backend/common/inverse.cpp
+++ b/mlx/backend/common/inverse.cpp
@@ -1,8 +1,8 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/allocator.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/lapack.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/primitives.h"

 int strtri_wrapper(char uplo, char diag, float* matrix, int N) {
@@ -110,7 +110,7 @@ void inverse_impl(const array& a, array& inv, bool tri, bool upper) {
  }
 }

-void Inverse::eval_cpu(const std::vector<array>& inputs, array& output) {
+void Inverse::eval(const std::vector<array>& inputs, array& output) {
  if (inputs[0].dtype() != float32) {
    throw std::runtime_error("[Inverse::eval] only supports float32.");
  }
--- a/mlx/backend/common/lapack.h
+++ b/mlx/backend/common/lapack.h
@@ -11,7 +11,7 @@
 #define lapack_complex_double std::complex<double>
 #endif

-#ifdef MLX_USE_ACCELERATE
+#ifdef ACCELERATE_NEW_LAPACK
 #include <Accelerate/Accelerate.h>
 #else
 #include <cblas.h>
--- a/mlx/backend/common/load.cpp
+++ b/mlx/backend/common/load.cpp
@@ -1,9 +1,12 @@
 // Copyright © 2023 Apple Inc.

 #include <algorithm>
+#include <cassert>
 #include <utility>

+#include "mlx/allocator.h"
 #include "mlx/backend/common/load.h"
+#include "mlx/primitives.h"

 namespace {

@@ -48,4 +51,11 @@ void load(
  }
 }

+// Allocates the output buffer and fills it by calling load() with this
+// primitive's offset, reader and endianness flag. Takes no input arrays.
+void Load::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.empty());
+
+  const auto num_bytes = out.nbytes();
+  out.set_data(allocator::malloc_or_wait(num_bytes));
+
+  load(out, offset_, reader_, swap_endianness_);
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/make_compiled_preamble.ps1
+++ b/mlx/backend/common/make_compiled_preamble.ps1
@@ -8,12 +8,12 @@ $CL = $args[1]
 $SRCDIR = $args[2]

 # Get command result as array.
-$CONTENT = & $CL /std:c++17 /EP "/I$SRCDIR" /Tp "$SRCDIR/mlx/backend/cpu/compiled_preamble.h"
+$CONTENT = & $CL /std:c++17 /EP "/I$SRCDIR" /Tp "$SRCDIR/mlx/backend/common/compiled_preamble.h"
 # Remove empty lines.
 # Otherwise there will be too much empty lines making the result unreadable.
 $CONTENT = $CONTENT | Where-Object { $_.Trim() -ne '' }
 # Concatenate to string.
-$CONTENT = $CONTENT -join "`n"
+$CONTENT = $CONTENT -join '`n'

 # Append extra content.
 $CONTENT = @"
--- a/mlx/backend/common/make_compiled_preamble.sh
+++ b/mlx/backend/common/make_compiled_preamble.sh
@@ -24,7 +24,7 @@ else
 CC_FLAGS="-std=c++17"
 fi

-CONTENT=$($GCC $CC_FLAGS -I "$SRCDIR" -E "$SRCDIR/mlx/backend/cpu/compiled_preamble.h" 2>/dev/null)
+CONTENT=$($GCC $CC_FLAGS -I "$SRCDIR" -E "$SRCDIR/mlx/backend/common/compiled_preamble.h" 2>/dev/null)

 cat << EOF > "$OUTPUT_FILE"
 const char* get_kernel_preamble() {
--- a/mlx/backend/common/masked_mm.cpp
+++ b/mlx/backend/common/masked_mm.cpp
@@ -3,9 +3,9 @@
 #include <cstring>

 #include "mlx/array.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/lapack.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
@@ -53,7 +53,7 @@ inline void mask_matrix(

 } // namespace

-void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
+void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
  if (out.dtype() != float32) {
    throw std::runtime_error(
        "[BlockMaskedMM::eval] Currently only supports float32.");
@@ -210,7 +210,7 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
+void GatherMM::eval(const std::vector<array>& inputs, array& out) {
  if (out.dtype() != float32) {
    throw std::runtime_error(
        "[GatherMM::eval] Currently only supports float32.");
--- a/mlx/backend/common/ops.h
+++ b/mlx/backend/common/ops.h
@@ -0,0 +1,680 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#pragma once
+#include <stdint.h>
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstring>
+#include <limits>
+
+namespace mlx::core::detail {
+
+namespace {
+// Positive single-precision infinity, private to each including translation
+// unit because it lives in an unnamed namespace.
+constexpr float inf = std::numeric_limits<float>::infinity();
+} // namespace
+
+// Union overlaying an int and a float in the same storage.
+typedef union {
+  int i;
+  float f;
+} IntOrFloat;
+
+// Fast single-precision approximation of exp(x).
+//
+// Special values are handled up front: -inf returns 0, +inf and NaN are
+// returned unchanged. Otherwise x is converted to a base-2 exponent, clamped
+// to [-80, 80], split into a nearest-integer part and a fractional part, and
+// the result is 2**ipart times a degree-6 polynomial in the fractional part.
+inline float fast_exp(float x) {
+  if (x == -std::numeric_limits<float>::infinity()) {
+    return 0.0f;
+  } else if (x == std::numeric_limits<float>::infinity() || std::isnan(x)) {
+    return x;
+  }
+  x *= 1.442695; // multiply with log_2(e)
+  float ipart, fpart;
+  // Clamping keeps the biased exponent built below inside the valid range.
+  x = std::max(-80.f, std::min(x, 80.f));
+  ipart = std::floor(x + 0.5);
+  fpart = x - ipart;
+
+  // Polynomial in fpart, evaluated with Horner's scheme.
+  x = 1.535336188319500e-4f;
+  x = x * fpart + 1.339887440266574e-3f;
+  x = x * fpart + 9.618437357674640e-3f;
+  x = x * fpart + 5.550332471162809e-2f;
+  x = x * fpart + 2.402264791363012e-1f;
+  x = x * fpart + 6.931472028550421e-1f;
+  x = x * fpart + 1.000000000000000f;
+
+  // generate 2**ipart in the floating point representation using integer
+  // bitshifting. The bits are transferred with memcpy because reading the
+  // inactive member of a union is undefined behaviour in C++.
+  const int32_t exponent_bits = (static_cast<int32_t>(ipart) + 127) << 23;
+  float scale;
+  static_assert(
+      sizeof(scale) == sizeof(exponent_bits),
+      "fast_exp requires a 32-bit float");
+  std::memcpy(&scale, &exponent_bits, sizeof(scale));
+
+  return scale * x;
+}
+
+// Single-precision approximation of the error function erf(a).
+//
+// Works on t = |a| and s = a * a and picks one of two polynomial branches
+// at |a| > 0.927734375. The large-argument branch builds an exponent r from
+// t, returns 1 - exp(r) and restores the sign of `a` with copysign; the
+// small-argument branch evaluates a polynomial in s and returns r * a + a.
+inline float fast_erf(float a) {
+  float r, s, t, u;
+  t = std::abs(a);
+  s = a * a;
+  if (t > 0.927734375f) {
+    // maximum error 0.99527 ulp
+    r = std::fma(
+        -1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12
+    u = std::fma(
+        -3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6
+    r = std::fma(r, s, u);
+    r = std::fma(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4
+    r = std::fma(r, t, -6.34846687e-1f); // -0x1.450aa0p-1
+    r = std::fma(r, t, -1.28717512e-1f); // -0x1.079d0cp-3
+    r = std::fma(r, t, -t);
+    // TODO, replace with expm1 when implemented
+    r = 1.0f - std::exp(r);
+    r = std::copysign(r, a);
+  } else {
+    // maximum error 0.98929 ulp
+    r = -5.96761703e-4f; // -0x1.38e000p-11
+    r = std::fma(r, s, 4.99119423e-3f); //  0x1.471a58p-8
+    r = std::fma(r, s, -2.67681349e-2f); // -0x1.b691b2p-6
+    r = std::fma(r, s, 1.12819925e-1f); //  0x1.ce1c44p-4
+    r = std::fma(r, s, -3.76125336e-1f); // -0x1.812700p-2
+    r = std::fma(r, s, 1.28379166e-1f); //  0x1.06eba8p-3
+    r = std::fma(r, a, a);
+  }
+  return r;
+}
+
+inline float fast_erfinv(float a) {
+  auto t = std::fma(a, 0.0f - a, 1.0f);
+  t = std::log(t);
+  float p;
+  if (std::abs(t) > 6.125f) { // maximum ulp error = 2.35793
+    p = 3.03697567e-10f; //  0x1.4deb44p-32
+    p = std::fma(p, t, 2.93243101e-8f); //  0x1.f7c9aep-26
+    p = std::fma(p, t, 1.22150334e-6f); //  0x1.47e512p-20
+    p = std::fma(p, t, 2.84108955e-5f); //  0x1.dca7dep-16
+    p = std::fma(p, t, 3.93552968e-4f); //  0x1.9cab92p-12
+    p = std::fma(p, t, 3.02698812e-3f); //  0x1.8cc0dep-9
+    p = std::fma(p, t, 4.83185798e-3f); //  0x1.3ca920p-8
+    p = std::fma(p, t, -2.64646143e-1f); // -0x1.0eff66p-2
+    p = std::fma(p, t, 8.40016484e-1f); //  0x1.ae16a4p-1
+  } else { // maximum ulp error = 2.35002
+    p = 5.43877832e-9f; //  0x1.75c000p-28
+    p = std::fma(p, t, 1.43285448e-7f); //  0x1.33b402p-23
+    p = std::fma(p, t, 1.22774793e-6f); //  0x1.499232p-20
+    p = std::fma(p, t, 1.12963626e-7f); //  0x1.e52cd2p-24
+    p = std::fma(p, t, -5.61530760e-5f); // -0x1.d70bd0p-15
+    p = std::fma(p, t, -1.47697632e-4f); // -0x1.35be90p-13
+    p = std::fma(p, t, 2.31468678e-3f); //  0x1.2f6400p-9
+    p = std::fma(p, t, 1.15392581e-2f); //  0x1.7a1e50p-7
+    p = std::fma(p, t, -2.32015476e-1f); // -0x1.db2aeep-3
+    p = std::fma(p, t, 8.86226892e-1f); //  0x1.c5bf88p-1
+  }
+  return a * p;
+}
+
+// Absolute value functor. The generic overload calls std::abs; the unsigned
+// integer and bool overloads return their argument unchanged.
+struct Abs {
+  template <typename T>
+  T operator()(T x) {
+    return std::abs(x);
+  }
+  uint8_t operator()(uint8_t x) {
+    return x;
+  }
+  uint16_t operator()(uint16_t x) {
+    return x;
+  }
+  uint32_t operator()(uint32_t x) {
+    return x;
+  }
+  uint64_t operator()(uint64_t x) {
+    return x;
+  }
+  bool operator()(bool x) {
+    return x;
+  }
+};
+
+// Inverse cosine functor; forwards to std::acos.
+struct ArcCos {
+  template <typename T>
+  T operator()(T x) {
+    return std::acos(x);
+  }
+};
+
+// Inverse hyperbolic cosine functor; forwards to std::acosh.
+struct ArcCosh {
+  template <typename T>
+  T operator()(T x) {
+    return std::acosh(x);
+  }
+};
+
+// Inverse sine functor; forwards to std::asin.
+struct ArcSin {
+  template <typename T>
+  T operator()(T x) {
+    return std::asin(x);
+  }
+};
+
+// Inverse hyperbolic sine functor; forwards to std::asinh.
+struct ArcSinh {
+  template <typename T>
+  T operator()(T x) {
+    return std::asinh(x);
+  }
+};
+
+// Inverse tangent functor; forwards to std::atan.
+struct ArcTan {
+  template <typename T>
+  T operator()(T x) {
+    return std::atan(x);
+  }
+};
+
+// Two-argument inverse tangent functor; forwards to std::atan2(y, x). Note
+// that y is the first argument and x the second.
+struct ArcTan2 {
+  template <typename T>
+  T operator()(T y, T x) {
+    return std::atan2(y, x);
+  }
+};
+
+// Inverse hyperbolic tangent functor; forwards to std::atanh.
+struct ArcTanh {
+  template <typename T>
+  T operator()(T x) {
+    return std::atanh(x);
+  }
+};
+
+// Ceiling functor. The generic overload calls std::ceil; the signed and
+// unsigned integer overloads and the bool overload return their argument
+// unchanged.
+struct Ceil {
+  template <typename T>
+  T operator()(T x) {
+    return std::ceil(x);
+  }
+  int8_t operator()(int8_t x) {
+    return x;
+  }
+  int16_t operator()(int16_t x) {
+    return x;
+  }
+  int32_t operator()(int32_t x) {
+    return x;
+  }
+  int64_t operator()(int64_t x) {
+    return x;
+  }
+  uint8_t operator()(uint8_t x) {
+    return x;
+  }
+  uint16_t operator()(uint16_t x) {
+    return x;
+  }
+  uint32_t operator()(uint32_t x) {
+    return x;
+  }
+  uint64_t operator()(uint64_t x) {
+    return x;
+  }
+  bool operator()(bool x) {
+    return x;
+  }
+};
+
+// Complex conjugate functor; only a complex64_t overload is provided and it
+// forwards to std::conj.
+struct Conjugate {
+  complex64_t operator()(complex64_t x) {
+    return std::conj(x);
+  }
+};
+
+// Cosine functor; forwards to std::cos.
+struct Cos {
+  template <typename T>
+  T operator()(T x) {
+    return std::cos(x);
+  }
+};
+
+// Hyperbolic cosine functor; forwards to std::cosh.
+struct Cosh {
+  template <typename T>
+  T operator()(T x) {
+    return std::cosh(x);
+  }
+};
+
+// Error function functor: converts the argument to float, evaluates
+// fast_erf, and converts the result back to T.
+struct Erf {
+  template <typename T>
+  T operator()(T x) {
+    return static_cast<T>(fast_erf(static_cast<float>(x)));
+  }
+};
+
+// Inverse error function functor: converts the argument to float, evaluates
+// fast_erfinv, and converts the result back to T.
+struct ErfInv {
+  template <typename T>
+  T operator()(T x) {
+    return static_cast<T>(fast_erfinv(static_cast<float>(x)));
+  }
+};
+
+// Exponential functor. The generic overload uses the fast_exp approximation
+// (which takes a float); the complex64_t overload uses std::exp.
+struct Exp {
+  template <typename T>
+  T operator()(T x) {
+    return fast_exp(x);
+  }
+
+  complex64_t operator()(complex64_t x) {
+    return std::exp(x);
+  }
+};
+
+// exp(x) - 1 functor. expm1 is called unqualified rather than as
+// std::expm1 — presumably so overloads for non-builtin element types can be
+// picked up; TODO confirm.
+struct Expm1 {
+  template <typename T>
+  T operator()(T x) {
+    return expm1(x);
+  }
+};
+
+// Floor functor. The generic overload calls std::floor; the signed and
+// unsigned integer overloads and the bool overload return their argument
+// unchanged.
+struct Floor {
+  template <typename T>
+  T operator()(T x) {
+    return std::floor(x);
+  }
+  int8_t operator()(int8_t x) {
+    return x;
+  }
+  int16_t operator()(int16_t x) {
+    return x;
+  }
+  int32_t operator()(int32_t x) {
+    return x;
+  }
+  int64_t operator()(int64_t x) {
+    return x;
+  }
+  uint8_t operator()(uint8_t x) {
+    return x;
+  }
+  uint16_t operator()(uint16_t x) {
+    return x;
+  }
+  uint32_t operator()(uint32_t x) {
+    return x;
+  }
+  uint64_t operator()(uint64_t x) {
+    return x;
+  }
+  bool operator()(bool x) {
+    return x;
+  }
+};
+
+// Imaginary-part functor; forwards to std::imag and returns the result as T.
+struct Imag {
+  template <typename T>
+  T operator()(T x) {
+    return std::imag(x);
+  }
+};
+
+// Natural logarithm functor; forwards to std::log.
+struct Log {
+  template <typename T>
+  T operator()(T x) {
+    return std::log(x);
+  }
+};
+
+// Base-2 logarithm functor; forwards to std::log2.
+struct Log2 {
+  template <typename T>
+  T operator()(T x) {
+    return std::log2(x);
+  }
+};
+
+// Base-10 logarithm functor; forwards to std::log10.
+struct Log10 {
+  template <typename T>
+  T operator()(T x) {
+    return std::log10(x);
+  }
+};
+
+// log(1 + x) functor. log1p is called unqualified rather than as
+// std::log1p — presumably so overloads for non-builtin element types can be
+// picked up; TODO confirm.
+struct Log1p {
+  template <typename T>
+  T operator()(T x) {
+    return log1p(x);
+  }
+};
+
+// Logical negation functor; returns !x converted to T.
+struct LogicalNot {
+  template <typename T>
+  T operator()(T x) {
+    return !x;
+  }
+};
+
+// Arithmetic negation functor; returns -x.
+struct Negative {
+  template <typename T>
+  T operator()(T x) {
+    return -x;
+  }
+};
+
+// Functor extracting the real part of an element via std::real.
+struct Real {
+  template <typename T>
+  T operator()(T x) {
+    auto real_part = std::real(x);
+    return real_part;
+  }
+};
+
+// Functor rounding an element with std::rint, which follows the current
+// floating point rounding mode.
+struct Round {
+  template <typename T>
+  T operator()(T x) {
+    auto rounded = std::rint(x);
+    return rounded;
+  }
+
+  // Complex values are rounded component-wise.
+  complex64_t operator()(complex64_t x) {
+    auto rounded_real = std::rint(x.real());
+    auto rounded_imag = std::rint(x.imag());
+    return {rounded_real, rounded_imag};
+  }
+};
+
+// Functor computing the logistic sigmoid 1 / (1 + exp(-x)) with fast_exp.
+struct Sigmoid {
+  template <typename T>
+  T operator()(T x) {
+    // Keep the constant in the element type so the arithmetic stays in T.
+    T one = static_cast<T>(1.0);
+    auto denominator = one + fast_exp(-x);
+    return one / denominator;
+  }
+};
+
+// Functor returning the sign of an element.
+struct Sign {
+  // Generic path: (x > 0) - (x < 0) gives 1, 0 or -1.
+  template <typename T>
+  T operator()(T x) {
+    T zero = T(0);
+    auto is_positive = x > zero;
+    auto is_negative = x < zero;
+    return is_positive - is_negative;
+  }
+
+  // Unsigned values are never negative: the sign is 0 for zero and 1
+  // otherwise.
+  uint8_t operator()(uint8_t value) {
+    return value == 0 ? 0 : 1;
+  }
+  uint16_t operator()(uint16_t value) {
+    return value == 0 ? 0 : 1;
+  }
+  uint32_t operator()(uint32_t value) {
+    return value == 0 ? 0 : 1;
+  }
+  uint64_t operator()(uint64_t value) {
+    return value == 0 ? 0 : 1;
+  }
+
+  // Complex sign: zero maps to itself, anything else is divided by its
+  // magnitude.
+  complex64_t operator()(complex64_t x) {
+    if (x == complex64_t(0)) {
+      return x;
+    }
+    return x / std::abs(x);
+  }
+};
+
+// Functor computing the sine of an element.
+struct Sin {
+  template <typename T>
+  T operator()(T x) {
+    auto sine = std::sin(x);
+    return sine;
+  }
+};
+
+// Functor computing the hyperbolic sine of an element.
+struct Sinh {
+  template <typename T>
+  T operator()(T x) {
+    auto hyperbolic_sine = std::sinh(x);
+    return hyperbolic_sine;
+  }
+};
+
+// Functor multiplying an element by itself.
+struct Square {
+  template <typename T>
+  T operator()(T x) {
+    auto squared = x * x;
+    return squared;
+  }
+};
+
+// Functor computing the square root of an element.
+struct Sqrt {
+  template <typename T>
+  T operator()(T x) {
+    auto root = std::sqrt(x);
+    return root;
+  }
+};
+
+// Functor computing the reciprocal square root 1 / sqrt(x).
+struct Rsqrt {
+  template <typename T>
+  T operator()(T x) {
+    // The numerator is built in the element type, as in 1.0 cast to T.
+    T one = static_cast<T>(1.0);
+    return one / std::sqrt(x);
+  }
+};
+
+// Functor computing the tangent of an element.
+struct Tan {
+  template <typename T>
+  T operator()(T x) {
+    auto tangent = std::tan(x);
+    return tangent;
+  }
+};
+
+// Functor computing the hyperbolic tangent of an element.
+struct Tanh {
+  template <typename T>
+  T operator()(T x) {
+    auto hyperbolic_tangent = std::tanh(x);
+    return hyperbolic_tangent;
+  }
+};
+
+// Binary functor returning x + y.
+struct Add {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto sum = x + y;
+    return sum;
+  }
+};
+
+// Binary functor returning x / y using the element type's own division.
+struct Divide {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto quotient = x / y;
+    return quotient;
+  }
+};
+
+// Binary functor computing a remainder whose sign follows the denominator
+// for signed integer and floating point operands.
+struct Remainder {
+  // Unsigned integers: the built-in % already yields a non-negative result.
+  template <typename T>
+  std::enable_if_t<std::is_integral_v<T> && !std::is_signed_v<T>, T>
+  operator()(T numerator, T denominator) {
+    return numerator % denominator;
+  }
+
+  // Signed integers: % truncates toward zero, so shift a non-zero result
+  // whose sign differs from the denominator's.
+  template <typename T>
+  std::enable_if_t<std::is_integral_v<T> && std::is_signed_v<T>, T>
+  operator()(T numerator, T denominator) {
+    auto r = numerator % denominator;
+    const bool sign_mismatch = (r < 0) != (denominator < 0);
+    if (r != 0 && sign_mismatch) {
+      r += denominator;
+    }
+    return r;
+  }
+
+  // Non-integral types: same adjustment applied on top of std::fmod.
+  template <typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(
+      T numerator,
+      T denominator) {
+    auto r = std::fmod(numerator, denominator);
+    const bool sign_mismatch = (r < 0) != (denominator < 0);
+    if (r != 0 && sign_mismatch) {
+      r += denominator;
+    }
+    return r;
+  }
+
+  // Complex operands rely on the operator% provided for complex64_t.
+  complex64_t operator()(complex64_t numerator, complex64_t denominator) {
+    return numerator % denominator;
+  }
+};
+
+// Binary predicate: true when x == y.
+struct Equal {
+  template <typename T>
+  bool operator()(T x, T y) {
+    auto same = (x == y);
+    return same;
+  }
+};
+
+// Binary predicate like Equal, except that two NaNs compare equal.
+struct NaNEqual {
+  template <typename T>
+  bool operator()(T x, T y) {
+    // std::isnan is only instantiated for non-integer types: it is always
+    // false for integers and MSVC refuses to compile it.
+    if constexpr (!std::is_integral_v<T>) {
+      if (std::isnan(x) && std::isnan(y)) {
+        return true;
+      }
+    }
+    return x == y;
+  }
+};
+
+// Binary predicate: true when x > y.
+struct Greater {
+  template <typename T>
+  bool operator()(T x, T y) {
+    auto is_greater = (x > y);
+    return is_greater;
+  }
+};
+
+// Binary predicate: true when x >= y.
+struct GreaterEqual {
+  template <typename T>
+  bool operator()(T x, T y) {
+    auto is_greater_or_equal = (x >= y);
+    return is_greater_or_equal;
+  }
+};
+
+// Binary predicate: true when x < y.
+struct Less {
+  template <typename T>
+  bool operator()(T x, T y) {
+    auto is_less = (x < y);
+    return is_less;
+  }
+};
+
+// Binary predicate: true when x <= y.
+struct LessEqual {
+  template <typename T>
+  bool operator()(T x, T y) {
+    auto is_less_or_equal = (x <= y);
+    return is_less_or_equal;
+  }
+};
+
+// Binary functor returning the larger operand; NaNs propagate for
+// non-integer types.
+struct Maximum {
+  // Integers: plain comparison, y wins ties.
+  template <typename T>
+  std::enable_if_t<std::is_integral_v<T>, T> operator()(T x, T y) {
+    if (x > y) {
+      return x;
+    }
+    return y;
+  }
+
+  // Non-integers: a NaN in x is returned directly; a NaN in y falls out of
+  // the comparison because x > NaN is false.
+  template <typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(T x, T y) {
+    if (std::isnan(x) || x > y) {
+      return x;
+    }
+    return y;
+  }
+};
+
+// Binary functor returning the smaller operand; NaNs propagate for
+// non-integer types.
+struct Minimum {
+  // Integers: plain comparison, y wins ties.
+  template <typename T>
+  std::enable_if_t<std::is_integral_v<T>, T> operator()(T x, T y) {
+    if (x < y) {
+      return x;
+    }
+    return y;
+  }
+
+  // Non-integers: a NaN in x is returned directly; a NaN in y falls out of
+  // the comparison because x < NaN is false.
+  template <typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(T x, T y) {
+    if (std::isnan(x) || x < y) {
+      return x;
+    }
+    return y;
+  }
+};
+
+// Binary functor computing log(exp(x) + exp(y)) in the stable form
+// max + log1p(exp(min - max)).
+struct LogAddExp {
+  template <typename T>
+  T operator()(T x, T y) {
+    constexpr float inf = std::numeric_limits<float>::infinity();
+    auto larger = Maximum()(x, y);
+    auto smaller = Minimum()(x, y);
+    // With an infinite operand the larger value is already the answer, and
+    // returning it avoids evaluating inf - inf.
+    if (smaller == -inf || larger == inf) {
+      return larger;
+    }
+    return static_cast<T>(larger + std::log1p(fast_exp(smaller - larger)));
+  }
+};
+
+// Binary functor returning x * y.
+struct Multiply {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto product = x * y;
+    return product;
+  }
+};
+
+// Binary predicate: true when x != y.
+struct NotEqual {
+  template <typename T>
+  bool operator()(T x, T y) {
+    auto differs = (x != y);
+    return differs;
+  }
+};
+
+// Binary functor raising `base` to the power `exp`.
+struct Power {
+  // Non-integer types defer to std::pow.
+  template <typename T>
+  std::enable_if_t<!std::is_integral_v<T>, T> operator()(T base, T exp) {
+    return std::pow(base, exp);
+  }
+
+  // Integer types use exponentiation by squaring.
+  //
+  // A negative exponent yields the truncated value of 1 / base^|exp|:
+  // 1 for base == 1, +/-1 for base == -1 (by parity of exp), 0 otherwise
+  // (including base == 0, where the true result is undefined).
+  template <typename T>
+  std::enable_if_t<std::is_integral_v<T>, T> operator()(T base, T exp) {
+    if constexpr (std::is_signed_v<T>) {
+      // Right-shifting a negative exponent never reaches zero, so the loop
+      // below would not terminate; resolve this case up front.
+      if (exp < 0) {
+        if (base == 1) {
+          return 1;
+        }
+        if (base == -1) {
+          return (exp & 1) ? -1 : 1;
+        }
+        return 0;
+      }
+    }
+    T res = 1;
+    while (exp) {
+      if (exp & 1) {
+        res *= base;
+      }
+      exp >>= 1;
+      // Square only while more exponent bits remain; a trailing, unused
+      // squaring could overflow a signed base.
+      if (exp) {
+        base *= base;
+      }
+    }
+    return res;
+  }
+};
+
+// Binary functor returning x - y.
+struct Subtract {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto difference = x - y;
+    return difference;
+  }
+};
+
+// Binary functor returning x && y, converted to T.
+struct LogicalAnd {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto both = x && y;
+    return both;
+  }
+};
+
+// Binary functor returning x || y, converted to T.
+struct LogicalOr {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto either = x || y;
+    return either;
+  }
+};
+
+// Ternary functor: returns x when condition is true, otherwise y.
+struct Select {
+  template <typename T>
+  T operator()(bool condition, T x, T y) {
+    if (condition) {
+      return x;
+    }
+    return y;
+  }
+};
+
+// Binary functor returning the bitwise AND of its operands.
+struct BitwiseAnd {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto masked = x & y;
+    return masked;
+  }
+};
+
+// Binary functor returning the bitwise OR of its operands.
+struct BitwiseOr {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto combined = x | y;
+    return combined;
+  }
+};
+
+// Binary functor returning the bitwise XOR of its operands.
+struct BitwiseXor {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto toggled = x ^ y;
+    return toggled;
+  }
+};
+
+// Binary functor shifting x left by y bits.
+struct LeftShift {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto shifted = x << y;
+    return shifted;
+  }
+};
+
+// Binary functor shifting x right by y bits.
+struct RightShift {
+  template <typename T>
+  T operator()(T x, T y) {
+    auto shifted = x >> y;
+    return shifted;
+  }
+};
+
+} // namespace mlx::core::detail
--- a/mlx/backend/common/primitives.cpp
+++ b/mlx/backend/common/primitives.cpp
@@ -0,0 +1,638 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <numeric>
+#include <sstream>
+
+#include "mlx/allocator.h"
+#include "mlx/backend/common/arange.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/ops.h"
+#include "mlx/backend/common/slicing.h"
+#include "mlx/backend/common/threefry.h"
+#include "mlx/backend/common/unary.h"
+#include "mlx/backend/common/utils.h"
+#include "mlx/primitives.h"
+#include "mlx/utils.h"
+
+namespace mlx::core {
+
+// Produce `out` as a reshaped view of `in`, copying only when
+// prepare_reshape reports that a copy is necessary.
+void reshape(const array& in, array& out) {
+  auto [needs_copy, new_strides] = prepare_reshape(in, out);
+  if (!needs_copy) {
+    // The existing buffer can be reused with the new strides.
+    shared_buffer_reshape(in, new_strides, out);
+    return;
+  }
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  copy_inplace(in, out, CopyType::General);
+}
+
+// Evaluate Abs: unsigned inputs share the input buffer, everything else
+// goes through detail::Abs.
+void Abs::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  const bool is_unsigned = issubdtype(in.dtype(), unsignedinteger);
+  if (!is_unsigned) {
+    unary(in, out, detail::Abs());
+    return;
+  }
+  // Unsigned values are already their own absolute value.
+  out.copy_shared_buffer(in);
+}
+
+// Evaluate Arange by forwarding to the arange() helper with this
+// primitive's start_ and step_ members.
+void Arange::eval(const std::vector<array>& inputs, array& out) {
+  arange(inputs, out, start_, step_);
+}
+
+// Evaluate ArcCos; throws std::invalid_argument unless the output dtype is
+// inexact.
+void ArcCos::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[arccos] Cannot compute inverse cosine of elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::ArcCos());
+}
+
+// Evaluate ArcCosh; throws std::invalid_argument unless the output dtype is
+// inexact.
+void ArcCosh::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[arccosh] Cannot compute inverse hyperbolic cosine of elements in"
+        " array with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::ArcCosh());
+}
+
+// Evaluate ArcSin; throws std::invalid_argument unless the output dtype is
+// inexact.
+void ArcSin::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[arcsin] Cannot compute inverse sine of elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::ArcSin());
+}
+
+// Evaluate ArcSinh; throws std::invalid_argument unless the output dtype is
+// inexact.
+void ArcSinh::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[arcsinh] Cannot compute inverse hyperbolic sine of elements in"
+        " array with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::ArcSinh());
+}
+
+// Evaluate ArcTan; throws std::invalid_argument unless the output dtype is
+// inexact.
+void ArcTan::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[arctan] Cannot compute inverse tangent of elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::ArcTan());
+}
+
+// Evaluate ArcTanh; throws std::invalid_argument unless the output dtype is
+// inexact.
+void ArcTanh::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[arctanh] Cannot compute inverse hyperbolic tangent of elements in"
+        " array with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::ArcTanh());
+}
+
+// Evaluate AsType by copying the input into `out`, using the vector copy
+// path when the input is flagged contiguous.
+void AsType::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  CopyType ctype = CopyType::General;
+  if (in.flags().contiguous) {
+    ctype = CopyType::Vector;
+  }
+  copy(in, out, ctype);
+}
+
+// Evaluate Ceil: inexact inputs are rounded up, other dtypes share the
+// input buffer.
+void Ceil::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  if (!issubdtype(in.dtype(), inexact)) {
+    // Integer values are unchanged by ceil.
+    out.copy_shared_buffer(in);
+    return;
+  }
+  unary_fp(in, out, detail::Ceil());
+}
+
+// Evaluate Concatenate: allocate the output, then copy each input into the
+// slice of the output it occupies along axis_.
+void Concatenate::eval(const std::vector<array>& inputs, array& out) {
+  // offsets[i] is the position along axis_ where input i starts; the final
+  // entry is the total extent.
+  std::vector<int> offsets;
+  offsets.push_back(0);
+  int running_total = 0;
+  for (auto& input : inputs) {
+    running_total += input.shape(axis_);
+    offsets.push_back(running_total);
+  }
+
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+
+  auto strides = out.strides();
+  // Each slice is a strided window into the output, so all contiguity flags
+  // are cleared for it.
+  auto flags = out.flags();
+  flags.row_contiguous = false;
+  flags.col_contiguous = false;
+  flags.contiguous = false;
+  for (int i = 0; i < inputs.size(); i++) {
+    const auto& input = inputs[i];
+    size_t data_offset = strides[axis_] * offsets[i];
+    array out_slice(input.shape(), out.dtype(), nullptr, {});
+    out_slice.copy_shared_buffer(
+        out, strides, flags, out_slice.size(), data_offset);
+    copy_inplace(input, out_slice, CopyType::GeneralGeneral);
+  }
+}
+
+// Evaluate Conjugate; throws std::invalid_argument unless the output dtype
+// is complex64.
+void Conjugate::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!(out.dtype() == complex64)) {
+    throw std::invalid_argument(
+        "[conjugate] conjugate must be called on complex input.");
+  }
+  unary_fp(inputs[0], out, detail::Conjugate());
+}
+
+// Evaluate Contiguous: share the input buffer when its layout is already
+// acceptable, otherwise make a general copy.
+void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  // Row-major always qualifies; column-major only when allow_col_major_ is
+  // set.
+  const bool layout_ok = in.flags().row_contiguous ||
+      (allow_col_major_ && in.flags().col_contiguous);
+  if (!layout_ok) {
+    copy(in, out, CopyType::General);
+    return;
+  }
+  out.copy_shared_buffer(in);
+}
+
+// Evaluate Cos; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Cos::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[cos] Cannot compute cosine of elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Cos());
+}
+
+// Evaluate Cosh; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Cosh::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[cosh] Cannot compute hyperbolic cosine of elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Cosh());
+}
+
+// Evaluate Erf for float32, float16 and bfloat16 outputs; any other dtype
+// throws std::invalid_argument.
+void Erf::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  switch (out.dtype()) {
+    case float32:
+      unary_op<float>(in, out, detail::Erf());
+      return;
+    case float16:
+      unary_op<float16_t>(in, out, detail::Erf());
+      return;
+    case bfloat16:
+      unary_op<bfloat16_t>(in, out, detail::Erf());
+      return;
+    default:
+      break;
+  }
+  throw std::invalid_argument(
+      "[erf] Error function only defined for arrays"
+      " with real floating point type.");
+}
+
+// Evaluate ErfInv for float32, float16 and bfloat16 outputs; any other
+// dtype throws std::invalid_argument.
+void ErfInv::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  switch (out.dtype()) {
+    case float32:
+      unary_op<float>(in, out, detail::ErfInv());
+      return;
+    case float16:
+      unary_op<float16_t>(in, out, detail::ErfInv());
+      return;
+    case bfloat16:
+      unary_op<bfloat16_t>(in, out, detail::ErfInv());
+      return;
+    default:
+      break;
+  }
+  throw std::invalid_argument(
+      "[erf_inv] Inverse error function only defined for arrays"
+      " with real floating point type.");
+}
+
+// Evaluate Exp; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Exp::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[exp] Cannot exponentiate elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Exp());
+}
+
+// Evaluate Expm1; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Expm1::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[expm1] Cannot exponentiate elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Expm1());
+}
+
+// CPU evaluation of Flatten: delegates to the file-local reshape() helper,
+// which shares the input buffer or copies depending on prepare_reshape.
+void Flatten::eval_cpu(const std::vector<array>& inputs, array& out) {
+  reshape(inputs[0], out);
+}
+
+// CPU evaluation of Unflatten: delegates to the file-local reshape()
+// helper, which shares the input buffer or copies depending on
+// prepare_reshape.
+void Unflatten::eval_cpu(const std::vector<array>& inputs, array& out) {
+  reshape(inputs[0], out);
+}
+
+// Evaluate Floor: inexact inputs are rounded down, other dtypes share the
+// input buffer.
+void Floor::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  if (!issubdtype(in.dtype(), inexact)) {
+    // Integer values are unchanged by floor.
+    out.copy_shared_buffer(in);
+    return;
+  }
+  unary_fp(in, out, detail::Floor());
+}
+
+// Evaluate Full by copying the input into `out`, choosing the copy type
+// from the input's data size and contiguity flag.
+void Full::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  assert(in.dtype() == out.dtype());
+  // Start from the most general strategy and specialize when possible.
+  CopyType ctype = CopyType::General;
+  if (in.data_size() == 1) {
+    ctype = CopyType::Scalar;
+  } else if (in.flags().contiguous) {
+    ctype = CopyType::Vector;
+  }
+  copy(in, out, ctype);
+}
+
+// CPU evaluation of Imag: applies detail::Imag to the first input through
+// unary_op<complex64_t, float> (presumably the input and output element
+// types — confirm against unary_op's declaration).
+void Imag::eval_cpu(const std::vector<array>& inputs, array& out) {
+  unary_op<complex64_t, float>(inputs[0], out, detail::Imag());
+}
+
+// Evaluate Log in the base selected by base_; throws
+// std::invalid_argument unless the output dtype is inexact.
+void Log::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[log] Cannot compute log of elements in array with"
+        " non floating point type.");
+  }
+  // Pick the functor matching the requested base.
+  switch (base_) {
+    case Base::e:
+      unary_fp(in, out, detail::Log());
+      return;
+    case Base::two:
+      unary_fp(in, out, detail::Log2());
+      return;
+    case Base::ten:
+      unary_fp(in, out, detail::Log10());
+      return;
+  }
+}
+
+// Evaluate Log1p; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Log1p::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[log1p] Cannot compute log of elements in array with"
+        " non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Log1p());
+}
+
+// Evaluate LogicalNot by applying detail::LogicalNot to the single input.
+void LogicalNot::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  unary(inputs[0], out, detail::LogicalNot());
+}
+
+// Evaluate Negative by applying detail::Negative to the single input.
+void Negative::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  unary(inputs[0], out, detail::Negative());
+}
+
+// Evaluate Pad: fill the whole output with the scalar padding value, then
+// copy the input into the window that starts after the low padding.
+void Pad::eval(const std::vector<array>& inputs, array& out) {
+  // Two inputs are expected: the array to pad and the scalar fill value.
+  assert(inputs.size() == 2);
+  auto& source = inputs[0];
+  auto& fill_value = inputs[1];
+
+  // The fill value is a single element of the same dtype as input and
+  // output.
+  assert(fill_value.size() == 1);
+  assert(
+      fill_value.dtype() == source.dtype() && source.dtype() == out.dtype());
+
+  // Step 1: broadcast the fill value over the output.
+  copy(fill_value, out, CopyType::Scalar);
+
+  // Step 2: locate the first input element inside the output by summing
+  // the low padding of each padded axis, scaled by that axis' stride.
+  size_t data_offset = 0;
+  for (int i = 0; i < axes_.size(); i++) {
+    // Negative axes count from the end.
+    auto ax = axes_[i] < 0 ? out.ndim() + axes_[i] : axes_[i];
+    data_offset += out.strides()[ax] * low_pad_size_[i];
+  }
+
+  // Step 3: view that window of the output with the input's shape.
+  array window(source.shape(), out.dtype(), nullptr, {});
+  window.copy_shared_buffer(
+      out, out.strides(), out.flags(), window.size(), data_offset);
+
+  // Step 4: write the input through the window.
+  copy_inplace(source, window, CopyType::GeneralGeneral);
+}
+
+// Evaluate RandomBits: fill `out` with bytes produced by
+// random::threefry2x32_hash, giving each key its own consecutive run of
+// bytes_per_key output bytes.
+void RandomBits::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  // keys has shape (N1, ..., NK, 2)
+  // out has shape (N1, ..., NK, M1, M2, ...)
+  auto& keys = inputs[0];
+  size_t num_keys = keys.size() / 2;
+
+  size_t elems_per_key = out.size() / num_keys;
+  size_t bytes_per_key = out.itemsize() * elems_per_key;
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+
+  auto kptr = inputs[0].data<uint32_t>();
+  auto cptr = out.data<char>();
+  // Number of 32-bit words needed per key (bytes_per_key rounded up to a
+  // multiple of 4), and half of that: each hash call yields two words.
+  size_t out_skip = (bytes_per_key + 4 - 1) / 4;
+  auto half_size = out_skip / 2;
+  bool even = out_skip % 2 == 0;
+  for (int i = 0; i < num_keys; ++i, cptr += bytes_per_key) {
+    auto ptr = reinterpret_cast<uint32_t*>(cptr);
+    // Get ith key
+    auto kidx = 2 * i;
+    // The key array may be strided, so translate element indices to
+    // storage locations.
+    auto k1_elem = elem_to_loc(kidx, keys.shape(), keys.strides());
+    auto k2_elem = elem_to_loc(kidx + 1, keys.shape(), keys.strides());
+    auto key = std::make_pair(kptr[k1_elem], kptr[k2_elem]);
+
+    // count.first indexes words in the first half of this key's output and
+    // count.second words in the second half; the pair is also passed to
+    // the hash as its counter argument.
+    std::pair<uintptr_t, uintptr_t> count{0, half_size + !even};
+    // All pairs except the last write two full words each.
+    for (; count.first + 1 < half_size; count.first++, count.second++) {
+      std::tie(ptr[count.first], ptr[count.second]) =
+          random::threefry2x32_hash(key, count);
+    }
+    // Last pair: the second word may be only partially used when
+    // bytes_per_key is not a multiple of 4.
+    if (count.first < half_size) {
+      auto rb = random::threefry2x32_hash(key, count);
+      ptr[count.first++] = rb.first;
+      if (bytes_per_key % 4 > 0) {
+        std::copy(
+            reinterpret_cast<char*>(&rb.second),
+            reinterpret_cast<char*>(&rb.second) + bytes_per_key % 4,
+            cptr + 4 * count.second);
+      } else {
+        ptr[count.second] = rb.second;
+      }
+    }
+    // Odd word count: the middle word at index half_size is filled from
+    // one extra hash call whose second counter is reset to 0.
+    if (!even) {
+      count.second = 0;
+      ptr[half_size] = random::threefry2x32_hash(key, count).first;
+    }
+  }
+}
+
+// CPU evaluation of Real: applies detail::Real to the first input through
+// unary_op<complex64_t, float> (presumably the input and output element
+// types — confirm against unary_op's declaration).
+void Real::eval_cpu(const std::vector<array>& inputs, array& out) {
+  unary_op<complex64_t, float>(inputs[0], out, detail::Real());
+}
+
+// CPU evaluation of Reshape: delegates to the file-local reshape() helper,
+// which shares the input buffer or copies depending on prepare_reshape.
+void Reshape::eval_cpu(const std::vector<array>& inputs, array& out) {
+  reshape(inputs[0], out);
+}
+
+// Evaluate Round: inexact inputs are rounded, other dtypes share the input
+// buffer.
+void Round::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  if (!issubdtype(in.dtype(), inexact)) {
+    // Integer values are unchanged by rounding.
+    out.copy_shared_buffer(in);
+    return;
+  }
+  unary_fp(in, out, detail::Round());
+}
+
+// Evaluate Sigmoid; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Sigmoid::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[sigmoid] Cannot sigmoid of elements in array with"
+        " non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Sigmoid());
+}
+
+// Evaluate Sign: boolean inputs share the input buffer, other dtypes go
+// through detail::Sign.
+void Sign::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  if (!(in.dtype() == bool_)) {
+    unary(in, out, detail::Sign());
+    return;
+  }
+  // The sign of a boolean is the boolean itself.
+  out.copy_shared_buffer(in);
+}
+
+// Evaluate Sin; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Sin::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[sin] Cannot compute sine of elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Sin());
+}
+
+// Evaluate Sinh; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Sinh::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[sinh] Cannot compute hyperbolic sine of elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Sinh());
+}
+
+// Evaluate Slice without copying: the output shares the input buffer via
+// shared_buffer_slice, using the offset and strides from prepare_slice.
+void Slice::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  // An empty slice has no data to reference.
+  if (out.size() == 0) {
+    out.set_data(nullptr);
+    return;
+  }
+
+  auto& in = inputs[0];
+
+  // Calculate out strides, initial offset and if copy needs to be made
+  auto [data_offset, inp_strides] = prepare_slice(in, start_indices_, strides_);
+  // data_end starts at 1 and, for every axis with more than one input
+  // element, adds the storage location of the last index the slice selects
+  // on that axis.
+  size_t data_end = 1;
+  for (int i = 0; i < end_indices_.size(); ++i) {
+    if (in.shape()[i] > 1) {
+      auto end_idx = start_indices_[i] + out.shape()[i] * strides_[i] - 1;
+      data_end += end_idx * in.strides()[i];
+    }
+  }
+  // Extent of the shared buffer, measured from data_offset.
+  size_t data_size = data_end - data_offset;
+  Strides ostrides{inp_strides.begin(), inp_strides.end()};
+  shared_buffer_slice(in, ostrides, data_offset, data_size, out);
+}
+
+// Evaluate SliceUpdate: copy the first input into `out`, then overwrite
+// the sliced region of `out` with the second input.
+void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 2);
+  // Nothing to produce for an empty output.
+  if (out.size() == 0) {
+    out.set_data(nullptr);
+    return;
+  }
+
+  auto& in = inputs[0];
+  auto& upd = inputs[1];
+
+  // An empty update leaves the input unchanged, so share its buffer.
+  if (upd.size() == 0) {
+    out.copy_shared_buffer(in);
+    return;
+  }
+
+  // Check if materialization is needed
+  // The vector copy is used only when the input is contiguous and its
+  // element count equals its data size; a single stored element uses the
+  // scalar copy.
+  auto ctype = in.flags().contiguous && in.size() == in.data_size()
+      ? CopyType::Vector
+      : CopyType::General;
+  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype);
+
+  // Calculate out strides, initial offset and if copy needs to be made
+  auto [data_offset, out_strides] = prepare_slice(in, start_indices_, strides_);
+
+  // Do copy
+  // The update is read from its own start (offset 0) and written into
+  // `out` starting at data_offset with the slice strides.
+  Strides upd_strides{upd.strides().begin(), upd.strides().end()};
+  copy_inplace(
+      /* const array& src = */ upd,
+      /* array& dst = */ out,
+      /* const std::vector<int>& data_shape = */ upd.shape(),
+      /* const std::vector<stride_t>& i_strides = */ upd_strides,
+      /* const std::vector<stride_t>& o_strides = */ out_strides,
+      /* int64_t i_offset = */ 0,
+      /* int64_t o_offset = */ data_offset,
+      /* CopyType ctype = */ CopyType::GeneralGeneral);
+}
+
+// Evaluate Square by applying detail::Square to the single input.
+void Square::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  unary(inputs[0], out, detail::Square());
+}
+
+// Evaluate Sqrt: the plain square root, or its reciprocal when recip_ is
+// set.
+void Sqrt::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  if (!recip_) {
+    unary_fp(in, out, detail::Sqrt());
+    return;
+  }
+  unary_fp(in, out, detail::Rsqrt());
+}
+
+// Evaluate Tan; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Tan::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[tan] Cannot compute tangent of elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Tan());
+}
+
+// Evaluate Tanh; throws std::invalid_argument unless the output dtype is
+// inexact.
+void Tanh::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  if (!issubdtype(out.dtype(), inexact)) {
+    throw std::invalid_argument(
+        "[tanh] Cannot compute hyperbolic tangent of elements in array"
+        " with non floating point type.");
+  }
+  unary_fp(inputs[0], out, detail::Tanh());
+}
+
+// CPU evaluation of View: reinterpret the input's bytes as the output
+// dtype, sharing the buffer when the layout allows it and otherwise
+// copying the input into a fresh buffer first.
+void View::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  auto ibytes = size_of(in.dtype());
+  auto obytes = size_of(out.dtype());
+  // Conditions for buffer copying (disjunction):
+  // - type size is the same
+  // - type size is smaller and the last axis is contiguous
+  // - the entire array is row contiguous
+  if (ibytes == obytes || obytes < ibytes && in.strides().back() == 1 ||
+      in.flags().row_contiguous) {
+    // Rescale every stride except the last from input-element units to
+    // output-element units.
+    auto strides = in.strides();
+    for (int i = 0; i < static_cast<int>(strides.size()) - 1; ++i) {
+      strides[i] *= ibytes;
+      strides[i] /= obytes;
+    }
+    // The shared data size is likewise converted to output elements.
+    out.copy_shared_buffer(
+        in, strides, in.flags(), in.data_size() * ibytes / obytes);
+  } else {
+    // Copy the input into a temporary; bool inputs are handled as uint8
+    // for the copy.
+    auto tmp = array(
+        in.shape(), in.dtype() == bool_ ? uint8 : in.dtype(), nullptr, {});
+    tmp.set_data(allocator::malloc_or_wait(tmp.nbytes()));
+    if (in.dtype() == bool_) {
+      auto in_tmp = array(in.shape(), uint8, nullptr, {});
+      in_tmp.copy_shared_buffer(in);
+      copy_inplace(in_tmp, tmp, CopyType::General);
+    } else {
+      copy_inplace(in, tmp, CopyType::General);
+    }
+
+    // The output takes over the temporary buffer and is flagged contiguous
+    // and row contiguous; it is column contiguous only when it has at most
+    // one element or a single dimension holds all of them.
+    auto flags = out.flags();
+    flags.contiguous = true;
+    flags.row_contiguous = true;
+    auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
+    flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
+    out.move_shared_buffer(tmp, out.strides(), flags, out.size());
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/qrf.cpp
+++ b/mlx/backend/common/qrf.cpp
@@ -1,8 +1,8 @@
 // Copyright © 2023-2024 Apple Inc.

 #include "mlx/allocator.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/lapack.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
@@ -41,7 +41,7 @@ template <typename T>
 void qrf_impl(const array& a, array& q, array& r) {
  const int M = a.shape(-2);
  const int N = a.shape(-1);
-  const int lda = M;
+  const int lda = std::max(M, N);
  size_t num_matrices = a.size() / (M * N);
  int num_reflectors = std::min(M, N);
  auto tau =
@@ -89,16 +89,13 @@ void qrf_impl(const array& a, array& q, array& r) {
  allocator::free(work);

  r.set_data(allocator::malloc_or_wait(r.nbytes()));
+  copy_inplace(in, r, CopyType::General);

  for (int i = 0; i < num_matrices; ++i) {
-    /// num_reflectors x N
+    // Zero lower triangle
    for (int j = 0; j < r.shape(-2); ++j) {
      for (int k = 0; k < j; ++k) {
-        r.data<T>()[i * N * num_reflectors + j * N + k] = 0;
-      }
-      for (int k = j; k < r.shape(-1); ++k) {
-        r.data<T>()[i * N * num_reflectors + j * N + k] =
-            in.data<T>()[i * N * M + j + k * M];
+        r.data<T>()[i * N * M + j * N + k] = 0;
      }
    }
  }
@@ -107,7 +104,7 @@ void qrf_impl(const array& a, array& q, array& r) {
  lwork = -1;
  lpack<T>::xorgqr(
      &M,
-      &num_reflectors,
+      &N,
      &num_reflectors,
      nullptr,
      &lda,
@@ -123,7 +120,7 @@ void qrf_impl(const array& a, array& q, array& r) {
    // Compute Q
    lpack<T>::xorgqr(
        &M,
-        &num_reflectors,
+        &N,
        &num_reflectors,
        in.data<float>() + M * N * i,
        &lda,
@@ -134,24 +131,14 @@ void qrf_impl(const array& a, array& q, array& r) {
  }

  q.set_data(allocator::malloc_or_wait(q.nbytes()));
-  for (int i = 0; i < num_matrices; ++i) {
-    // M x num_reflectors
-    for (int j = 0; j < q.shape(-2); ++j) {
-      for (int k = 0; k < q.shape(-1); ++k) {
-        q.data<T>()[i * M * num_reflectors + j * num_reflectors + k] =
-            in.data<T>()[i * N * M + j + k * M];
-      }
-    }
-  }
+  copy_inplace(in, q, CopyType::General);

  // Cleanup
  allocator::free(work);
  allocator::free(tau);
 }

-void QRF::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
+void QRF::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
  if (!(inputs[0].dtype() == float32)) {
    throw std::runtime_error("[QRF::eval] only supports float32.");
  }
--- a/mlx/backend/common/quantized.cpp
+++ b/mlx/backend/common/quantized.cpp
@@ -2,8 +2,8 @@

 #include <cassert>

-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/simd/simd.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/ops.h"
 #include "mlx/fast_primitives.h"
 #include "mlx/primitives.h"
 #include "mlx/utils.h"
@@ -151,78 +151,6 @@ void _qmm_t(
  }
 }

-template <int bits, int S>
-simd::Simd<uint32_t, S> extract_bits_simd(const uint32_t* w) {
-  constexpr int bitmask = (1 << bits) - 1;
-  simd::Simd<uint32_t, S> wi;
-  if constexpr (bits == 4 && S == 8) {
-    constexpr std::array<uint32_t, 8> shifts_ = {{0, 4, 8, 12, 16, 20, 24, 28}};
-    auto shifts(*(simd::Simd<uint32_t, S>*)&shifts_);
-    wi = simd::Simd<uint32_t, S>(*w);
-    wi = wi >> shifts;
-    wi = wi & bitmask;
-  } else if constexpr (bits == 8 && S == 8) {
-    constexpr std::array<uint32_t, 8> shifts_ = {{0, 8, 16, 24, 0, 8, 16, 24}};
-    auto shifts(*(simd::Simd<uint32_t, S>*)&shifts_);
-    auto l = simd::Simd<uint32_t, 4>(*w++);
-    auto r = simd::Simd<uint32_t, 4>(*w);
-    wi = simd::Simd<uint32_t, S>(l, r);
-    wi = wi >> shifts;
-    wi = wi & bitmask;
-  } else {
-    // Appease compiler.. but should never get here
-    throw std::runtime_error("Unsupported combination for simd qmm.");
-  }
-  return wi;
-}
-
-template <typename T, int bits, int group_size>
-void _qmm_t_simd(
-    T* result,
-    const T* x,
-    const uint32_t* w,
-    const T* scales,
-    const T* biases,
-    int M,
-    int N,
-    int K) {
-  constexpr int pack_factor = 32 / bits;
-  constexpr int packs_in_group = group_size / pack_factor;
-  constexpr int S = simd::max_size<T>;
-  static_assert(
-      S % pack_factor == 0, "SIMD size must be divisible by pack factor");
-  constexpr int packs_per_simd = S / pack_factor;
-
-  for (int m = 0; m < M; m++) {
-    const uint32_t* w_local = w;
-    const T* scales_local = scales;
-    const T* biases_local = biases;
-
-    for (int n = 0; n < N; n++) {
-      simd::Simd<float, S> acc(0);
-      auto x_local = x;
-      for (int k = 0; k < K; k += group_size) {
-        T scale = *scales_local++;
-        T bias = *biases_local++;
-
-        for (int kw = 0; kw < packs_in_group; kw += packs_per_simd) {
-          auto wf = simd::Simd<float, S>(extract_bits_simd<bits, S>(w_local));
-          w_local += packs_per_simd;
-          wf = wf * scale;
-          wf = wf + bias;
-          simd::Simd<float, S> x_simd = simd::load<T, S>(x_local);
-          acc = acc + x_simd * wf;
-          x_local += S;
-        }
-      }
-
-      *result = T(simd::sum(acc));
-      result++;
-    }
-    x += K;
-  }
-}
-
 template <typename T, int bits, int group_size>
 void _qmm_dispatch_transpose(
    T* result,
@@ -235,14 +163,9 @@ void _qmm_dispatch_transpose(
    int K,
    bool transposed_w) {
  if (transposed_w) {
-    // the simd size must be a multiple of the number of elements per word
-    if constexpr (32 % bits == 0 && simd::max_size<T> % (32 / bits) == 0) {
-      _qmm_t_simd<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
-    } else {
-      _qmm_t<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
-    }
+    return _qmm_t<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
  } else {
-    _qmm<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
+    return _qmm<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
  }
 }

@@ -326,13 +249,13 @@ void _qmm_dispatch(
    int group_size,
    bool transposed_w) {
  int K = x.shape(-1);
-  int M = x.ndim() > 1 ? x.shape(-2) : 1;
+  int M = x.shape(-2);
  int N = out.shape(-1);

  int w_els = w.ndim() > 2 ? w.shape(-1) * w.shape(-2) : 0;
  int g_els = w.ndim() > 2 ? scales.shape(-1) * scales.shape(-2) : 0;

-  int batch_size = x.size() / (K * M);
+  int batch_size = x.size() / x.shape(-1) / x.shape(-2);
  for (int i = 0; i < batch_size; i++) {
    switch (x.dtype()) {
      case float32:
@@ -461,7 +384,7 @@ void _bs_qmm_dispatch(

 } // namespace

-void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
+void QuantizedMatmul::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 4);

  auto& x_pre = inputs[0];
@@ -488,7 +411,7 @@ void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
  _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
 }

-void GatherQMM::eval_cpu(const std::vector<array>& inputs, array& out) {
+void GatherQMM::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 6);

  auto& x_pre = inputs[0];
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -1,147 +1,312 @@
-// Copyright © 2024 Apple Inc.
+// Copyright © 2023 Apple Inc.
+
+#include <cassert>
+#include <functional>
+#include <limits>

 #include "mlx/backend/common/reduce.h"
+#include "mlx/primitives.h"

 namespace mlx::core {

-std::pair<Shape, Strides> shapes_without_reduction_axes(
-    const array& x,
-    const std::vector<int>& axes) {
-  auto shape = x.shape();
-  auto strides = x.strides();
+namespace {

-  for (int i = axes.size() - 1; i >= 0; i--) {
-    int a = axes[i];
-    shape.erase(shape.begin() + a);
-    strides.erase(strides.begin() + a);
+// Starting values for min/max reductions, keyed by element type.
+// reduce_dispatch_min_max seeds a Max reduction with Limits<T>::min and a Min
+// reduction with Limits<T>::max. The primary template only declares the two
+// members; each supported type gets an explicit specialization below.
+template <typename U>
+struct Limits {
+  static const U max;
+  static const U min;
+};
+
+// Integer types: use the finite extremes reported by std::numeric_limits.
+#define instantiate_default_limit(type)                           \
+  template <>                                                     \
+  struct Limits<type> {                                           \
+    static constexpr type max = std::numeric_limits<type>::max(); \
+    static constexpr type min = std::numeric_limits<type>::min(); \
+  };
+
+instantiate_default_limit(uint8_t);
+instantiate_default_limit(uint16_t);
+instantiate_default_limit(uint32_t);
+instantiate_default_limit(uint64_t);
+instantiate_default_limit(int8_t);
+instantiate_default_limit(int16_t);
+instantiate_default_limit(int32_t);
+instantiate_default_limit(int64_t);
+
+// Floating-point-like types: only declare the members here; the values are
+// defined out of class further down (as +/- infinity).
+#define instantiate_float_limit(type) \
+  template <>                         \
+  struct Limits<type> {               \
+    static const type max;            \
+    static const type min;            \
+  };
+
+instantiate_float_limit(float16_t);
+instantiate_float_limit(bfloat16_t);
+instantiate_float_limit(float);
+instantiate_float_limit(complex64_t);
+
+// bool: max is true, min is false.
+template <>
+struct Limits<bool> {
+  static constexpr bool max = true;
+  static constexpr bool min = false;
+};
+
+// Out-of-class definitions for the floating-point-like specializations. Each
+// value is built from a float infinity, so max is +inf and min is -inf.
+// NOTE(review): for complex64_t this relies on a conversion from float;
+// presumably it sets the real part only -- confirm.
+const float Limits<float>::max = std::numeric_limits<float>::infinity();
+const float Limits<float>::min = -std::numeric_limits<float>::infinity();
+const bfloat16_t Limits<bfloat16_t>::max =
+    std::numeric_limits<float>::infinity();
+const bfloat16_t Limits<bfloat16_t>::min =
+    -std::numeric_limits<float>::infinity();
+const float16_t Limits<float16_t>::max = std::numeric_limits<float>::infinity();
+const float16_t Limits<float16_t>::min =
+    -std::numeric_limits<float>::infinity();
+const complex64_t Limits<complex64_t>::max =
+    std::numeric_limits<float>::infinity();
+const complex64_t Limits<complex64_t>::min =
+    -std::numeric_limits<float>::infinity();
+
+// Logical-AND accumulator: folds one input element into the bool at *a / *y.
+struct AndReduce {
+  // Generic element: any value different from 0 counts as true.
+  template <typename T>
+  void operator()(bool* a, T b) {
+    (*a) &= (b != 0);
   }

-  return std::make_pair(shape, strides);
+  // bool element: AND it in directly.
+  void operator()(bool* y, bool x) {
+    (*y) &= x;
+  }
+};
+
+// Logical-OR accumulator: folds one input element into the bool at *a / *y.
+struct OrReduce {
+  // Generic element: any value different from 0 counts as true.
+  template <typename T>
+  void operator()(bool* a, T b) {
+    (*a) |= (b != 0);
+  }
+
+  // bool element: OR it in directly.
+  void operator()(bool* y, bool x) {
+    (*y) |= x;
+  }
+};
+
+// Running-maximum accumulator: folds x into the value stored at *y.
+struct MaxReduce {
+  // Integral types: plain comparison, no NaN to worry about.
+  template <typename T>
+  std::enable_if_t<std::is_integral_v<T>> operator()(T* y, T x) {
+    (*y) = (*y > x) ? *y : x;
+  }
+
+  // Non-integral types: NaN is sticky. A NaN input replaces the accumulator,
+  // and an accumulator that is already NaN is left untouched. Without the
+  // second check `(*y > x)` is false for a NaN *y, so a later non-NaN x would
+  // overwrite the NaN and the result would depend on element order.
+  template <typename T>
+  std::enable_if_t<!std::is_integral_v<T>> operator()(T* y, T x) {
+    if (std::isnan(x)) {
+      *y = x;
+    } else if (!std::isnan(*y)) {
+      (*y) = (*y > x) ? *y : x;
+    }
+  }
+};
+
+// Running-minimum accumulator: folds x into the value stored at *y.
+struct MinReduce {
+  // Integral types: plain comparison, no NaN to worry about.
+  template <typename T>
+  std::enable_if_t<std::is_integral_v<T>> operator()(T* y, T x) {
+    (*y) = (*y < x) ? *y : x;
+  }
+
+  // Non-integral types: NaN is sticky. A NaN input replaces the accumulator,
+  // and an accumulator that is already NaN is left untouched. Without the
+  // second check `(*y < x)` is false for a NaN *y, so a later non-NaN x would
+  // overwrite the NaN and the result would depend on element order.
+  template <typename T>
+  std::enable_if_t<!std::is_integral_v<T>> operator()(T* y, T x) {
+    if (std::isnan(x)) {
+      *y = x;
+    } else if (!std::isnan(*y)) {
+      (*y) = (*y < x) ? *y : x;
+    }
+  }
+};
+
+// Runs a logical reduction of `in` over `axes` into the bool-typed `out`.
+// Reduce::And starts from true and uses AndReduce; any other `rtype` is
+// treated as Or, starting from false and using OrReduce. InT is the type the
+// input buffer is read as.
+template <typename InT>
+void reduce_dispatch_and_or(
+    const array& in,
+    array& out,
+    Reduce::ReduceType rtype,
+    const std::vector<int>& axes) {
+  if (rtype == Reduce::And) {
+    reduction_op<InT, bool>(in, out, axes, true, AndReduce());
+  } else {
+    reduction_op<InT, bool>(in, out, axes, false, OrReduce());
+  }
 }

-ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
-  // The data is all there and we are reducing over everything
-  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
-      x.flags().contiguous) {
-    return ContiguousAllReduce;
+// Runs a Sum (initial value 0) or, for any other `rtype`, a Prod (initial
+// value 1) reduction of `in` over `axes` into `out`. Integral inputs of at
+// most 4 bytes accumulate into int32_t; every other input type accumulates
+// in its own type.
+// NOTE(review): Reduce::eval routes bool_/uint8 through InT = int8_t and
+// uint16 through int16_t, so unsigned values above the signed maximum appear
+// to be read as negative before widening -- confirm this is intended.
+template <typename InT>
+void reduce_dispatch_sum_prod(
+    const array& in,
+    array& out,
+    Reduce::ReduceType rtype,
+    const std::vector<int>& axes) {
+  if (rtype == Reduce::Sum) {
+    auto op = [](auto y, auto x) { (*y) = (*y) + x; };
+    if constexpr (std::is_integral_v<InT> && sizeof(InT) <= 4) {
+      reduction_op<InT, int32_t>(in, out, axes, 0, op);
+    } else {
+      reduction_op<InT, InT>(in, out, axes, 0, op);
+    }
+  } else {
+    auto op = [](auto y, auto x) { (*y) *= x; };
+    if constexpr (std::is_integral_v<InT> && sizeof(InT) <= 4) {
+      reduction_op<InT, int32_t>(in, out, axes, 1, op);
+    } else {
+      reduction_op<InT, InT>(in, out, axes, 1, op);
+    }
   }
+}

-  // Row contiguous input so the output is row contiguous
-  if (x.flags().row_contiguous) {
-    // Merge consecutive axes
-    Shape shape = {x.shape(axes[0])};
-    Strides strides = {x.strides()[axes[0]]};
-    for (int i = 1; i < axes.size(); i++) {
-      if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
-        shape.back() *= x.shape(axes[i]);
-        strides.back() = x.strides()[axes[i]];
-      } else {
-        shape.push_back(x.shape(axes[i]));
-        strides.push_back(x.strides()[axes[i]]);
+// Runs a Max or, for any other `rtype`, a Min reduction of `in` over `axes`
+// into `out`, with input and output both of type InT. The accumulator starts
+// at the opposite extreme (Limits<InT>::min for Max, Limits<InT>::max for
+// Min) so that the first element folded in always replaces it.
+template <typename InT>
+void reduce_dispatch_min_max(
+    const array& in,
+    array& out,
+    Reduce::ReduceType rtype,
+    const std::vector<int>& axes) {
+  if (rtype == Reduce::Max) {
+    auto init = Limits<InT>::min;
+    reduction_op<InT, InT>(in, out, axes, init, MaxReduce());
+  } else {
+    auto init = Limits<InT>::max;
+    reduction_op<InT, InT>(in, out, axes, init, MinReduce());
+  }
+}
+
+} // namespace
+
+// Calls `callback` once for every index combination of `shape`, passing the
+// offset obtained by summing index * stride over all dimensions. Dimensions
+// are walked outermost first, so the last dimension varies fastest.
+void nd_loop(
+    std::function<void(int)> callback,
+    const Shape& shape,
+    const Strides& strides) {
+  // A loop over zero dimensions has exactly one (empty) index tuple. Handle
+  // it here: the recursion below would otherwise read shape[0] out of bounds.
+  if (shape.empty()) {
+    callback(0);
+    return;
+  }
+  std::function<void(int, int)> loop_inner;
+  loop_inner = [&](int dim, int offset) {
+    // Compare as signed ints; `shape.size() - 1` is unsigned arithmetic.
+    if (dim + 1 < static_cast<int>(shape.size())) {
+      auto size = shape[dim];
+      auto stride = strides[dim];
+      for (int i = 0; i < size; i++) {
+        loop_inner(dim + 1, offset + i * stride);
+      }
+    } else {
+      // Innermost dimension: hand each final offset to the callback.
+      auto size = shape[dim];
+      auto stride = strides[dim];
+      for (int i = 0; i < size; i++) {
+        callback(offset + i * stride);
       }
     }
+  };
+  loop_inner(0, 0);
+}

-    // Remove singleton axes from the plan
-    for (int i = shape.size() - 1; i >= 0; i--) {
-      if (shape[i] == 1) {
-        shape.erase(shape.begin() + i);
-        strides.erase(strides.begin() + i);
+void Reduce::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  switch (reduce_type_) {
+    case Reduce::And:
+    case Reduce::Or: {
+      switch (in.dtype()) {
+        case bool_:
+        case uint8:
+        case int8:
+          reduce_dispatch_and_or<int8_t>(in, out, reduce_type_, axes_);
+          break;
+        case int16:
+        case uint16:
+        case float16:
+        case bfloat16:
+          reduce_dispatch_and_or<int16_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint32:
+        case int32:
+        case float32:
+          reduce_dispatch_and_or<int32_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint64:
+        case int64:
+        case complex64:
+          reduce_dispatch_and_or<int64_t>(in, out, reduce_type_, axes_);
+          break;
      }
+      break;
    }
-
-    if (strides.back() == 1) {
-      return ReductionPlan(ContiguousReduce, shape, strides);
-    } else if (strides.back() > 1) {
-      return ReductionPlan(ContiguousStridedReduce, shape, strides);
-    }
-  }
-
-  // Let's check if we can optimize our access patterns
-  //
-  // 1. We have a reduction axis with stride 1. Simply call
-  //    GeneralContiguousReduce and be done with it.
-  // 2. We have transpositions and we are not reducing over the axis with
-  //    stride 1. However, we are reducing over an axis where everything is
-  //    contiguous in memory to the right of that axis. We can call strided
-  //    reduce and be done with it.
-  // 2. We have weird transpositions and expands. Copy the strides to the
-  //    output, then call strided reduce.
-
-  // Sort reduction axes by stride in order to merge them and figure out if we
-  // have a contiguous reduction.
-  std::vector<std::pair<int, int64_t>> reductions;
-  for (auto a : axes) {
-    if (x.shape(a) > 1) {
-      reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
-    }
-  }
-  std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
-    bool a_is_zero = a.second == 0;
-    bool b_is_zero = b.second == 0;
-    return (a_is_zero != b_is_zero) ? a.second < b.second : a.second > b.second;
-  });
-  // Extract the two smallest and try to merge them in case the contiguous
-  // reduction can be bigger than just the last axis.
-  for (int i = reductions.size() - 1; i >= 1; i--) {
-    auto a = reductions[i];
-    auto b = reductions[i - 1];
-
-    // b.stride = a.shape * a.stride then a and b are contiguous
-    if (b.second == a.first * a.second) {
-      reductions.erase(reductions.begin() + i);
-      reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
-    }
-  }
-
-  Shape shape;
-  Strides strides;
-  for (auto r : reductions) {
-    shape.push_back(r.first);
-    strides.push_back(r.second);
-  }
-
-  // We can call the contiguous reduction op for every weird way the input is
-  // structured in the rest of the axes.
-  if (strides.back() == 1) {
-    return ReductionPlan(GeneralContiguousReduce, shape, strides);
-  }
-
-  // Delegate to the general strided reduction op if the axes after
-  // strides.back() are contiguous.
-  if (strides.back() > 1) {
-    int64_t size = 1;
-    bool have_expand = false;
-    for (int i = x.ndim() - 1; i >= 0; i--) {
-      if (axes.back() == i) {
-        continue;
+    case Reduce::Sum:
+    case Reduce::Prod: {
+      switch (in.dtype()) {
+        case bool_:
+        case uint8:
+        case int8:
+          reduce_dispatch_sum_prod<int8_t>(in, out, reduce_type_, axes_);
+          break;
+        case int16:
+        case uint16:
+          reduce_dispatch_sum_prod<int16_t>(in, out, reduce_type_, axes_);
+          break;
+        case int32:
+        case uint32:
+          reduce_dispatch_sum_prod<int32_t>(in, out, reduce_type_, axes_);
+          break;
+        case int64:
+        case uint64:
+          reduce_dispatch_sum_prod<int64_t>(in, out, reduce_type_, axes_);
+          break;
+        case float16:
+          reduce_dispatch_sum_prod<float16_t>(in, out, reduce_type_, axes_);
+          break;
+        case bfloat16:
+          reduce_dispatch_sum_prod<bfloat16_t>(in, out, reduce_type_, axes_);
+          break;
+        case float32:
+          reduce_dispatch_sum_prod<float>(in, out, reduce_type_, axes_);
+          break;
+        case complex64:
+          reduce_dispatch_sum_prod<complex64_t>(in, out, reduce_type_, axes_);
+          break;
      }
-
-      auto stride_i = x.strides()[i];
-      auto shape_i = x.shape(i);
-      if (stride_i == 0) {
-        if (shape_i == 1) {
-          continue;
-        }
-
-        have_expand = true;
-        break;
-      }
-
-      if (stride_i != size && shape_i != 1) {
-        break;
-      }
-      size *= shape_i;
+      break;
    }
-    // In the case of an expanded dimension we are being conservative and
-    // require the smallest reduction stride to be smaller than the maximum row
-    // contiguous size. The reason is that we can't easily know if the reduced
-    // axis is before or after an expanded dimension.
-    if (size > strides.back() || (size == strides.back() && !have_expand)) {
-      return ReductionPlan(GeneralStridedReduce, shape, strides);
+    // Max/Min: dispatch on the exact element type. Signed integers must use
+    // the signed instantiations; reading int8/int16 data as unsigned makes
+    // negative values compare as large positive ones.
+    case Reduce::Max:
+    case Reduce::Min: {
+      switch (in.dtype()) {
+        case bool_:
+          reduce_dispatch_min_max<bool>(in, out, reduce_type_, axes_);
+          break;
+        case uint8:
+          reduce_dispatch_min_max<uint8_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint16:
+          reduce_dispatch_min_max<uint16_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint32:
+          reduce_dispatch_min_max<uint32_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint64:
+          reduce_dispatch_min_max<uint64_t>(in, out, reduce_type_, axes_);
+          break;
+        case int8:
+          reduce_dispatch_min_max<int8_t>(in, out, reduce_type_, axes_);
+          break;
+        case int16:
+          reduce_dispatch_min_max<int16_t>(in, out, reduce_type_, axes_);
+          break;
+        case int32:
+          reduce_dispatch_min_max<int32_t>(in, out, reduce_type_, axes_);
+          break;
+        case int64:
+          reduce_dispatch_min_max<int64_t>(in, out, reduce_type_, axes_);
+          break;
+        case float16:
+          reduce_dispatch_min_max<float16_t>(in, out, reduce_type_, axes_);
+          break;
+        case float32:
+          reduce_dispatch_min_max<float>(in, out, reduce_type_, axes_);
+          break;
+        case bfloat16:
+          reduce_dispatch_min_max<bfloat16_t>(in, out, reduce_type_, axes_);
+          break;
+        case complex64:
+          reduce_dispatch_min_max<complex64_t>(in, out, reduce_type_, axes_);
+          break;
+      }
+      break;
    }
  }
-
-  return ReductionPlan(GeneralReduce, shape, strides);
 }

 } // namespace mlx::core
--- a/mlx/backend/common/reduce.h
+++ b/mlx/backend/common/reduce.h
@@ -48,8 +48,186 @@ struct ReductionPlan {

 ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);

+// Helper for the n-dimensional strided loop: `callback` receives an int
+// offset; `shape` and `strides` describe the dimensions to iterate over.
+// TODO: consider whether this belongs in a shared utils header.
+void nd_loop(
+    std::function<void(int)> callback,
+    const Shape& shape,
+    const Strides& strides);
+
 std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes);

+// Default strided reduction kernel built from an element-wise `op`.
+// Reads `size * stride` consecutive elements from `x`, viewed as `size` rows
+// of `stride` elements, and folds element j of every row into accumulator[j].
+// The caller is expected to have initialized accumulator[0 .. stride).
+template <typename T, typename U, typename Op>
+struct DefaultStridedReduce {
+  Op op;
+
+  DefaultStridedReduce(Op op_) : op(op_) {}
+
+  void operator()(const T* x, U* accumulator, int size, size_t stride) {
+    for (int i = 0; i < size; i++) {
+      // Restart at the first accumulator slot for every row.
+      U* moving_accumulator = accumulator;
+      for (int j = 0; j < stride; j++) {
+        op(moving_accumulator, *x);
+        moving_accumulator++;
+        x++;
+      }
+    }
+  }
+};
+
+// Default contiguous reduction kernel built from an element-wise `op`.
+// Folds `size` consecutive elements starting at `x` into the single value at
+// `accumulator`, which the caller is expected to have initialized.
+template <typename T, typename U, typename Op>
+struct DefaultContiguousReduce {
+  Op op;
+
+  DefaultContiguousReduce(Op op_) : op(op_) {}
+
+  void operator()(const T* x, U* accumulator, int size) {
+    while (size-- > 0) {
+      op(accumulator, *x);
+      x++;
+    }
+  }
+};
+
+// Generic reduction driver. Allocates `out`, asks get_reduction_plan how `x`
+// can be traversed for the given `axes`, and runs the matching loop.
+//   T    - type the input buffer is read as
+//   U    - accumulator / output type
+//   init - starting value written to every output element
+//   ops  - strided kernel:    ops(x, acc, size, stride)
+//   opc  - contiguous kernel: opc(x, acc, size)
+//   op   - element-wise fold: op(acc, value), used by the general fallback
+template <typename T, typename U, typename OpS, typename OpC, typename Op>
+void reduction_op(
+    const array& x,
+    array& out,
+    const std::vector<int>& axes,
+    U init,
+    OpS ops,
+    OpC opc,
+    Op op) {
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  ReductionPlan plan = get_reduction_plan(x, axes);
+
+  // Whole input folded into a single output element.
+  if (plan.type == ContiguousAllReduce) {
+    U* out_ptr = out.data<U>();
+    *out_ptr = init;
+    opc(x.data<T>(), out_ptr, x.size());
+    return;
+  }
+
+  // One contiguous reduction dimension: every output element consumes the
+  // next `reduction_size` consecutive inputs.
+  if (plan.type == ContiguousReduce && plan.shape.size() == 1) {
+    int reduction_size = plan.shape[0];
+    const T* x_ptr = x.data<T>();
+    U* out_ptr = out.data<U>();
+    for (int i = 0; i < out.size(); i++, out_ptr++, x_ptr += reduction_size) {
+      *out_ptr = init;
+      opc(x_ptr, out_ptr, reduction_size);
+    }
+    return;
+  }
+
+  // The last plan dimension is a contiguous run handled by `opc`; any
+  // remaining plan dimensions are iterated with nd_loop. The base offset of
+  // each output element comes from the non-reduced shape/strides.
+  if (plan.type == GeneralContiguousReduce || plan.type == ContiguousReduce) {
+    int reduction_size = plan.shape.back();
+    plan.shape.pop_back();
+    plan.strides.pop_back();
+    const T* x_ptr = x.data<T>();
+    U* out_ptr = out.data<U>();
+    // Unrolling the following loop (and implementing it in order for
+    // ContiguousReduce) should hold extra performance boost.
+    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
+    if (plan.shape.size() == 0) {
+      for (int i = 0; i < out.size(); i++, out_ptr++) {
+        int offset = elem_to_loc(i, shape, strides);
+        *out_ptr = init;
+        opc(x_ptr + offset, out_ptr, reduction_size);
+      }
+    } else {
+      for (int i = 0; i < out.size(); i++, out_ptr++) {
+        int offset = elem_to_loc(i, shape, strides);
+        *out_ptr = init;
+        nd_loop(
+            [&](int extra_offset) {
+              opc(x_ptr + offset + extra_offset, out_ptr, reduction_size);
+            },
+            plan.shape,
+            plan.strides);
+      }
+    }
+    return;
+  }
+
+  // One strided reduction dimension: outputs are produced in groups of
+  // `reduction_stride`, each group consuming reduction_size * reduction_stride
+  // consecutive inputs.
+  if (plan.type == ContiguousStridedReduce && plan.shape.size() == 1) {
+    int reduction_size = plan.shape.back();
+    size_t reduction_stride = plan.strides.back();
+    plan.shape.pop_back();
+    plan.strides.pop_back();
+    const T* x_ptr = x.data<T>();
+    U* out_ptr = out.data<U>();
+    for (int i = 0; i < out.size(); i += reduction_stride) {
+      std::fill_n(out_ptr, reduction_stride, init);
+      ops(x_ptr, out_ptr, reduction_size, reduction_stride);
+      x_ptr += reduction_stride * reduction_size;
+      out_ptr += reduction_stride;
+    }
+    return;
+  }
+
+  // Strided reduction with extra plan dimensions and/or a non-trivial output
+  // layout: same grouping as above, but the input offset of each group comes
+  // from elem_to_loc and the remaining plan dimensions go through nd_loop.
+  if (plan.type == GeneralStridedReduce ||
+      plan.type == ContiguousStridedReduce) {
+    int reduction_size = plan.shape.back();
+    size_t reduction_stride = plan.strides.back();
+    plan.shape.pop_back();
+    plan.strides.pop_back();
+    const T* x_ptr = x.data<T>();
+    U* out_ptr = out.data<U>();
+    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
+    if (plan.shape.size() == 0) {
+      for (int i = 0; i < out.size(); i += reduction_stride) {
+        int offset = elem_to_loc(i, shape, strides);
+        std::fill_n(out_ptr, reduction_stride, init);
+        ops(x_ptr + offset, out_ptr, reduction_size, reduction_stride);
+        out_ptr += reduction_stride;
+      }
+    } else {
+      for (int i = 0; i < out.size(); i += reduction_stride) {
+        int offset = elem_to_loc(i, shape, strides);
+        std::fill_n(out_ptr, reduction_stride, init);
+        nd_loop(
+            [&](int extra_offset) {
+              ops(x_ptr + offset + extra_offset,
+                  out_ptr,
+                  reduction_size,
+                  reduction_stride);
+            },
+            plan.shape,
+            plan.strides);
+        out_ptr += reduction_stride;
+      }
+    }
+    return;
+  }
+
+  // Fallback: visit every reduced element individually and fold it with `op`
+  // into a local value that is stored once per output element.
+  if (plan.type == GeneralReduce) {
+    const T* x_ptr = x.data<T>();
+    U* out_ptr = out.data<U>();
+    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
+    for (int i = 0; i < out.size(); i++, out_ptr++) {
+      int offset = elem_to_loc(i, shape, strides);
+      U val = init;
+      nd_loop(
+          [&](int extra_offset) { op(&val, *(x_ptr + offset + extra_offset)); },
+          plan.shape,
+          plan.strides);
+      *out_ptr = val;
+    }
+  }
+}
+
+// Convenience overload: builds the default strided and contiguous kernels
+// from the element-wise `op` and forwards to the full reduction_op above.
+template <typename T, typename U, typename Op>
+void reduction_op(
+    const array& x,
+    array& out,
+    const std::vector<int>& axes,
+    U init,
+    Op op) {
+  DefaultStridedReduce<T, U, Op> ops(op);
+  DefaultContiguousReduce<T, U, Op> opc(op);
+  reduction_op<T, U>(x, out, axes, init, ops, opc, op);
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/reduce_utils.cpp
+++ b/mlx/backend/common/reduce_utils.cpp
@@ -0,0 +1,147 @@
+// Copyright © 2024 Apple Inc.
+
+#include "mlx/backend/common/reduce.h"
+
+namespace mlx::core {
+
+// Returns copies of x's shape and strides with the entries at `axes` removed.
+// Axes are erased starting from the end of `axes`.
+// NOTE(review): this only removes the right entries if `axes` is sorted in
+// ascending order (earlier erasures would otherwise shift later indices) --
+// confirm callers guarantee that.
+std::pair<Shape, Strides> shapes_without_reduction_axes(
+    const array& x,
+    const std::vector<int>& axes) {
+  auto shape = x.shape();
+  auto strides = x.strides();
+
+  for (int i = axes.size() - 1; i >= 0; i--) {
+    int a = axes[i];
+    shape.erase(shape.begin() + a);
+    strides.erase(strides.begin() + a);
+  }
+
+  return std::make_pair(shape, strides);
+}
+
+// Chooses a traversal strategy for reducing `x` over `axes` and returns it
+// together with the (merged) shape and strides of the reduced dimensions.
+// NOTE(review): both the row-contiguous branch and the general path call
+// strides.back() on a vector that is empty when every reduced axis has
+// extent 1 -- confirm callers never reach this function in that case.
+ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
+  // The data is all there and we are reducing over everything
+  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
+      x.flags().contiguous) {
+    return ContiguousAllReduce;
+  }
+
+  // Row contiguous input so the output is row contiguous
+  if (x.flags().row_contiguous) {
+    // Merge consecutive axes
+    Shape shape = {x.shape(axes[0])};
+    Strides strides = {x.strides()[axes[0]]};
+    for (int i = 1; i < axes.size(); i++) {
+      if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
+        shape.back() *= x.shape(axes[i]);
+        strides.back() = x.strides()[axes[i]];
+      } else {
+        shape.push_back(x.shape(axes[i]));
+        strides.push_back(x.strides()[axes[i]]);
+      }
+    }
+
+    // Remove singleton axes from the plan
+    for (int i = shape.size() - 1; i >= 0; i--) {
+      if (shape[i] == 1) {
+        shape.erase(shape.begin() + i);
+        strides.erase(strides.begin() + i);
+      }
+    }
+
+    // Any other value of the last stride falls through to the general
+    // analysis below.
+    if (strides.back() == 1) {
+      return ReductionPlan(ContiguousReduce, shape, strides);
+    } else if (strides.back() > 1) {
+      return ReductionPlan(ContiguousStridedReduce, shape, strides);
+    }
+  }
+
+  // Let's check if we can optimize our access patterns
+  //
+  // 1. We have a reduction axis with stride 1. Simply call
+  //    GeneralContiguousReduce and be done with it.
+  // 2. We have transpositions and we are not reducing over the axis with
+  //    stride 1. However, we are reducing over an axis where everything is
+  //    contiguous in memory to the right of that axis. We can call strided
+  //    reduce and be done with it.
+  // 3. We have weird transpositions and expands. Copy the strides to the
+  //    output, then call strided reduce.
+
+  // Sort reduction axes by stride in order to merge them and figure out if we
+  // have a contiguous reduction.
+  std::vector<std::pair<int, int64_t>> reductions;
+  for (auto a : axes) {
+    if (x.shape(a) > 1) {
+      reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
+    }
+  }
+  // Ordering: when exactly one of the two strides is zero the smaller stride
+  // goes first; otherwise the larger stride goes first.
+  std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
+    bool a_is_zero = a.second == 0;
+    bool b_is_zero = b.second == 0;
+    return (a_is_zero != b_is_zero) ? a.second < b.second : a.second > b.second;
+  });
+  // Extract the two smallest and try to merge them in case the contiguous
+  // reduction can be bigger than just the last axis.
+  for (int i = reductions.size() - 1; i >= 1; i--) {
+    auto a = reductions[i];
+    auto b = reductions[i - 1];
+
+    // b.stride = a.shape * a.stride then a and b are contiguous
+    if (b.second == a.first * a.second) {
+      reductions.erase(reductions.begin() + i);
+      reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
+    }
+  }
+
+  Shape shape;
+  Strides strides;
+  for (auto r : reductions) {
+    shape.push_back(r.first);
+    strides.push_back(r.second);
+  }
+
+  // We can call the contiguous reduction op for every weird way the input is
+  // structured in the rest of the axes.
+  if (strides.back() == 1) {
+    return ReductionPlan(GeneralContiguousReduce, shape, strides);
+  }
+
+  // Delegate to the general strided reduction op if the axes after
+  // strides.back() are contiguous.
+  if (strides.back() > 1) {
+    // `size` accumulates the extent of the row-contiguous tail of x, walking
+    // dimensions from last to first and skipping the last reduction axis.
+    int64_t size = 1;
+    bool have_expand = false;
+    for (int i = x.ndim() - 1; i >= 0; i--) {
+      if (axes.back() == i) {
+        continue;
+      }
+
+      auto stride_i = x.strides()[i];
+      auto shape_i = x.shape(i);
+      if (stride_i == 0) {
+        if (shape_i == 1) {
+          continue;
+        }
+
+        have_expand = true;
+        break;
+      }
+
+      if (stride_i != size && shape_i != 1) {
+        break;
+      }
+      size *= shape_i;
+    }
+    // In the case of an expanded dimension we are being conservative and
+    // require the smallest reduction stride to be smaller than the maximum row
+    // contiguous size. The reason is that we can't easily know if the reduced
+    // axis is before or after an expanded dimension.
+    if (size > strides.back() || (size == strides.back() && !have_expand)) {
+      return ReductionPlan(GeneralStridedReduce, shape, strides);
+    }
+  }
+
+  return ReductionPlan(GeneralReduce, shape, strides);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/scan.cpp
+++ b/mlx/backend/common/scan.cpp
@@ -0,0 +1,325 @@
+// Copyright © 2023 Apple Inc.
+
+#include <cassert>
+
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/utils.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+// Default scan kernel for the case where the scanned axis is contiguous.
+// Processes `count` consecutive rows of `stride` elements each (scan_op passes
+// the length of the scan axis as `stride`).
+//   op(o, y, x) - writes the combination of the previous result *y and the
+//                 input *x to *o
+//   init        - value stored in the first output slot of an exclusive scan
+//   reverse     - scan each row from its last element towards its first
+//   inclusive   - output[j] includes input[j]; otherwise it only covers the
+//                 elements visited before j
+template <typename T, typename U, typename Op>
+struct DefaultContiguousScan {
+  Op op;
+  U init;
+
+  DefaultContiguousScan(Op op_, U init_) : op(op_), init(init_) {}
+
+  void operator()(
+      const T* input,
+      U* output,
+      int count,
+      int stride,
+      bool reverse,
+      bool inclusive) {
+    if (!reverse) {
+      if (inclusive) {
+        for (int i = 0; i < count; i++) {
+          // First element is copied; every later one combines the previous
+          // output with the current input.
+          *output = *input;
+          for (int j = 1; j < stride; j++) {
+            input++;
+            output++;
+            op(output, output - 1, input);
+          }
+          output++;
+          input++;
+        }
+      } else {
+        for (int i = 0; i < count; i++) {
+          // First element is `init`; output[j + 1] combines output[j] with
+          // input[j], so the last input of the row is never read.
+          *output = init;
+          for (int j = 1; j < stride; j++) {
+            op(output + 1, output, input);
+            input++;
+            output++;
+          }
+          output++;
+          input++;
+        }
+      }
+    } else {
+      if (inclusive) {
+        for (int i = 0; i < count; i++) {
+          // Jump to the last element of the row and walk backwards.
+          output += stride - 1;
+          input += stride - 1;
+          *output = *input;
+          for (int j = 1; j < stride; j++) {
+            input--;
+            output--;
+            op(output, output + 1, input);
+          }
+          // The pointers are back at the row start; advance to the next row.
+          output += stride;
+          input += stride;
+        }
+      } else {
+        for (int i = 0; i < count; i++) {
+          // Jump to the last element of the row and walk backwards.
+          output += stride - 1;
+          input += stride - 1;
+          *output = init;
+          for (int j = 1; j < stride; j++) {
+            op(output - 1, output, input);
+            input--;
+            output--;
+          }
+          // The pointers are back at the row start; advance to the next row.
+          output += stride;
+          input += stride;
+        }
+      }
+    }
+  }
+};
+
+// Default scan kernel for the case where the scanned axis is not the
+// contiguous one. The data is treated as `count` blocks, each made of `size`
+// rows of `stride` consecutive elements; the scan runs along the `size`
+// direction independently for each of the `stride` columns.
+//   op(o, y, x) - writes the combination of the previous result *y and the
+//                 input *x to *o
+//   init        - value stored in the first row of an exclusive scan
+//   reverse     - scan each block from its last row towards its first
+//   inclusive   - an output row includes the matching input row; otherwise
+//                 it only covers the rows visited before it
+template <typename T, typename U, typename Op>
+struct DefaultStridedScan {
+  Op op;
+  U init;
+
+  DefaultStridedScan(Op op_, U init_) : op(op_), init(init_) {}
+
+  void operator()(
+      const T* input,
+      U* output,
+      int count,
+      int size,
+      int stride,
+      bool reverse,
+      bool inclusive) {
+    // TODO: Vectorize the following naive implementation
+    if (!reverse) {
+      if (inclusive) {
+        for (int i = 0; i < count; i++) {
+          // First row is copied as is; each later row combines the output
+          // one row back with the current input.
+          std::copy(input, input + stride, output);
+          output += stride;
+          input += stride;
+          for (int j = 1; j < size; j++) {
+            for (int k = 0; k < stride; k++) {
+              op(output, output - stride, input);
+              output++;
+              input++;
+            }
+          }
+        }
+      } else {
+        for (int i = 0; i < count; i++) {
+          // First row is `init`; each later row combines the output one row
+          // back with the input one row back.
+          std::fill(output, output + stride, init);
+          output += stride;
+          input += stride;
+          for (int j = 1; j < size; j++) {
+            for (int k = 0; k < stride; k++) {
+              op(output, output - stride, input - stride);
+              output++;
+              input++;
+            }
+          }
+        }
+      }
+    } else {
+      if (inclusive) {
+        for (int i = 0; i < count; i++) {
+          // Start at the last row of the block and walk backwards.
+          output += (size - 1) * stride;
+          input += (size - 1) * stride;
+          std::copy(input, input + stride, output);
+          for (int j = 1; j < size; j++) {
+            for (int k = 0; k < stride; k++) {
+              output--;
+              input--;
+              op(output, output + stride, input);
+            }
+          }
+          // The pointers are back at the block start; advance to the next
+          // block.
+          output += size * stride;
+          input += size * stride;
+        }
+      } else {
+        for (int i = 0; i < count; i++) {
+          // Start at the last row of the block and walk backwards.
+          output += (size - 1) * stride;
+          input += (size - 1) * stride;
+          std::fill(output, output + stride, init);
+          for (int j = 1; j < size; j++) {
+            for (int k = 0; k < stride; k++) {
+              output--;
+              input--;
+              op(output, output + stride, input + stride);
+            }
+          }
+          // The pointers are back at the block start; advance to the next
+          // block.
+          output += size * stride;
+          input += size * stride;
+        }
+      }
+    }
+  }
+};
+
+// Allocates `output` and runs a scan of `input` along `axis`.
+// Uses the contiguous kernel `opcs` when the axis has stride 1 and the
+// strided kernel `opss` otherwise. Only row-contiguous inputs are supported.
+// Throws std::runtime_error for any other layout.
+template <typename T, typename U, typename OpCS, typename OpSS>
+void scan_op(
+    OpCS opcs,
+    OpSS opss,
+    const array& input,
+    array& output,
+    int axis,
+    bool reverse,
+    bool inclusive) {
+  output.set_data(allocator::malloc_or_wait(output.nbytes()));
+
+  if (input.flags().row_contiguous) {
+    if (input.strides()[axis] == 1) {
+      // count = number of rows, stride = length of the scanned axis.
+      opcs(
+          input.data<T>(),
+          output.data<U>(),
+          input.size() / input.shape(axis),
+          input.shape(axis),
+          reverse,
+          inclusive);
+    } else {
+      // count = number of outer blocks, size = length of the scanned axis,
+      // stride = number of elements between consecutive steps on that axis.
+      opss(
+          input.data<T>(),
+          output.data<U>(),
+          input.size() / input.shape(axis) / input.strides()[axis],
+          input.shape(axis),
+          input.strides()[axis],
+          reverse,
+          inclusive);
+    }
+  } else {
+    throw std::runtime_error("Scan op supports only contiguous inputs");
+  }
+}
+
+// Builds the element-wise op and the initial value for the requested scan
+// type, wraps them in the default contiguous/strided kernels and runs
+// scan_op. T is the input element type and U the output element type.
+//   Sum  - starts from 0
+//   Prod - starts from 1
+//   Min  - starts from +infinity for floating dtypes, else the largest U
+//   Max  - starts from -infinity for floating dtypes, else the smallest U
+template <typename T, typename U>
+void scan_dispatch(
+    Scan::ReduceType rtype,
+    const array& input,
+    array& output,
+    int axis,
+    bool reverse,
+    bool inclusive) {
+  switch (rtype) {
+    case Scan::Sum: {
+      auto op = [](U* o, const U* y, const T* x) { *o = *y + *x; };
+      auto init = static_cast<U>(0);
+      auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
+      auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
+      scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);
+      break;
+    }
+    case Scan::Prod: {
+      auto op = [](U* o, const U* y, const T* x) { *o = *y * (*x); };
+      auto init = static_cast<U>(1);
+      auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
+      auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
+      scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);
+      break;
+    }
+    case Scan::Min: {
+      // Keeps the previous result unless the new input is strictly smaller.
+      auto op = [](U* o, const U* y, const T* x) { *o = (*x < *y) ? *x : *y; };
+      auto init = (issubdtype(input.dtype(), floating))
+          ? static_cast<U>(std::numeric_limits<float>::infinity())
+          : std::numeric_limits<U>::max();
+      auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
+      auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
+      scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);
+      break;
+    }
+    case Scan::Max: {
+      // Keeps the previous result only when the new input is strictly smaller.
+      auto op = [](U* o, const U* y, const T* x) { *o = (*x < *y) ? *y : *x; };
+      auto init = (issubdtype(input.dtype(), floating))
+          ? static_cast<U>(-std::numeric_limits<float>::infinity())
+          : std::numeric_limits<U>::min();
+      auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
+      auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
+      scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);
+      break;
+    }
+  }
+}
+
+} // namespace
+
+void Scan::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+
+  // Ensure contiguity
+  auto in = inputs[0];
+  if (!in.flags().row_contiguous) {
+    array arr_copy(in.shape(), in.dtype(), nullptr, {});
+    copy(in, arr_copy, CopyType::General);
+    in = arr_copy;
+  }
+  // Pick the <input, accumulator> instantiation from the input dtype.
+  switch (in.dtype()) {
+    case bool_: {
+      // We could do a full dtype x dtype switch but this is the only case
+      // where we accumulate in a different type, for now.
+      //
+      // TODO: If we add the option to accumulate floats in higher precision
+      //       floats perhaps we should add the full all-to-all dispatch.
+      if (reduce_type_ == Scan::Sum && out.dtype() == int32) {
+        scan_dispatch<bool, int32_t>(
+            reduce_type_, in, out, axis_, reverse_, inclusive_);
+      } else {
+        scan_dispatch<bool, bool>(
+            reduce_type_, in, out, axis_, reverse_, inclusive_);
+      }
+      break;
+    }
+    case uint8:
+      scan_dispatch<uint8_t, uint8_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case uint16:
+      scan_dispatch<uint16_t, uint16_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case uint32:
+      scan_dispatch<uint32_t, uint32_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case uint64:
+      scan_dispatch<uint64_t, uint64_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case int8:
+      scan_dispatch<int8_t, int8_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case int16:
+      scan_dispatch<int16_t, int16_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case int32:
+      scan_dispatch<int32_t, int32_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case int64:
+      scan_dispatch<int64_t, int64_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case float16:
+      scan_dispatch<float16_t, float16_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case float32:
+      scan_dispatch<float, float>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case bfloat16:
+      scan_dispatch<bfloat16_t, bfloat16_t>(
+          reduce_type_, in, out, axis_, reverse_, inclusive_);
+      break;
+    case complex64:
+      throw std::runtime_error("Scan ops do not support complex types yet");
+      break;
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/select.cpp
+++ b/mlx/backend/common/select.cpp
@@ -2,8 +2,7 @@

 #include <cassert>

-#include "mlx/backend/cpu/binary_ops.h"
-#include "mlx/backend/cpu/ternary.h"
+#include "mlx/backend/common/ternary.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
@@ -62,7 +61,7 @@ void select_op(

 } // namespace

-void Select::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Select::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 3);
  const auto& condition = inputs[0];
  const auto& a = inputs[1];
--- a/mlx/backend/common/slicing.cpp
+++ b/mlx/backend/common/slicing.cpp
@@ -35,29 +35,4 @@ void shared_buffer_slice(
  move_or_copy(in, out, out_strides, flags, data_size, data_offset);
 }

-void slice(
-    const array& in,
-    array& out,
-    const Shape& start_indices,
-    const Shape& strides) {
-  if (out.size() == 0) {
-    out.set_data(nullptr);
-    return;
-  }
-
-  // Calculate out strides, initial offset
-  auto [data_offset, inp_strides] = prepare_slice(in, start_indices, strides);
-  int64_t data_end = 1;
-  for (int i = 0; i < start_indices.size(); ++i) {
-    if (in.shape()[i] > 1) {
-      auto end_idx = start_indices[i] + out.shape()[i] * strides[i] - 1;
-      data_end += end_idx * in.strides()[i];
-    }
-  }
-  // data_end can be -1
-  size_t data_size =
-      data_end < 0 ? (data_offset - data_end) : (data_end - data_offset);
-  shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
-}
-
 } // namespace mlx::core
--- a/mlx/backend/common/slicing.h
+++ b/mlx/backend/common/slicing.h
@@ -11,10 +11,11 @@ std::tuple<int64_t, Strides> prepare_slice(
    const Shape& start_indices,
    const Shape& strides);

-void slice(
+void shared_buffer_slice(
    const array& in,
-    array& out,
-    const Shape& start_indices,
-    const Shape& strides);
+    const Strides& out_strides,
+    size_t data_offset,
+    size_t data_size,
+    array& out);

 } // namespace mlx::core
--- a/mlx/backend/common/softmax.cpp
+++ b/mlx/backend/common/softmax.cpp
@@ -3,108 +3,62 @@
 #include <cassert>
 #include <cmath>

-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/simd/simd.h"
+#include "mlx/backend/common/copy.h"
 #include "mlx/primitives.h"

 namespace mlx::core {

 namespace {

-using namespace mlx::core::simd;
-
 template <typename T, typename AccT>
 void softmax(const array& in, array& out) {
-  constexpr bool same_t = std::is_same_v<T, AccT>;
-  constexpr int N = std::min(max_size<AccT>, max_size<T>);
-
  const T* in_ptr = in.data<T>();
  T* out_ptr = out.data<T>();
-  int M = in.shape().back();
-  int L = in.data_size() / M;
+  int N = in.shape().back();
+  int M = in.data_size() / N;
  const T* current_in_ptr;
  T* current_out_ptr;

-  for (int i = 0; i < L; i++, in_ptr += M, out_ptr += M) {
+  for (int i = 0; i < M; i++, in_ptr += N, out_ptr += N) {
    // Find the maximum
    current_in_ptr = in_ptr;
-    Simd<AccT, N> vmaximum(-std::numeric_limits<float>::infinity());
-    size_t s = M;
-    while (s >= N) {
-      Simd<AccT, N> vals = load<T, N>(current_in_ptr);
-      vmaximum = maximum(vals, vmaximum);
-      current_in_ptr += N;
-      s -= N;
-    }
-
-    AccT maximum = max(vmaximum);
-    while (s-- > 0) {
-      maximum = std::max(maximum, static_cast<AccT>(*current_in_ptr));
-      current_in_ptr++;
+    AccT maximum = *current_in_ptr;
+    for (int j = 0; j < N; j++, current_in_ptr++) {
+      maximum = (maximum < *current_in_ptr) ? static_cast<AccT>(*current_in_ptr)
+                                            : maximum;
    }

    // Compute the normalizer and the exponentials
-    Simd<AccT, N> vnormalizer(0.0);
+    AccT normalizer = 0;
    current_out_ptr = out_ptr;
    current_in_ptr = in_ptr;
-    s = M;
-    while (s >= N) {
-      Simd<AccT, N> vexp = load<T, N>(current_in_ptr);
-      vexp = exp(vexp - maximum);
-      if constexpr (same_t) {
-        store(current_out_ptr, vexp);
+    for (int j = 0; j < N; j++, current_out_ptr++, current_in_ptr++) {
+      AccT expv = std::exp(*current_in_ptr - maximum);
+      normalizer += expv;
+      if constexpr (std::is_same<T, AccT>::value) {
+        *current_out_ptr = expv;
      }
-      vnormalizer = vnormalizer + vexp;
-      current_in_ptr += N;
-      current_out_ptr += N;
-      s -= N;
-    }
-    AccT normalizer = sum(vnormalizer);
-    while (s-- > 0) {
-      AccT _exp = std::exp(*current_in_ptr - maximum);
-      if constexpr (same_t) {
-        *current_out_ptr = _exp;
-      }
-      normalizer += _exp;
-      current_in_ptr++;
-      current_out_ptr++;
    }
    normalizer = 1 / normalizer;

    // Normalize
-    current_out_ptr = out_ptr;
    current_in_ptr = in_ptr;
-    s = M;
-    while (s >= N) {
-      if constexpr (same_t) {
-        store(
-            current_out_ptr,
-            Simd<T, N>(load<T, N>(current_out_ptr) * normalizer));
-      } else {
-        Simd<AccT, N> vexp = load<T, N>(current_in_ptr);
-        vexp = exp(vexp - maximum) * normalizer;
-        store(current_out_ptr, Simd<T, N>(vexp));
-        current_in_ptr += N;
-      }
-      current_out_ptr += N;
-      s -= N;
-    }
-    while (s-- > 0) {
-      if constexpr (same_t) {
+    current_out_ptr = out_ptr;
+    for (int j = 0; j < N; j++, current_out_ptr++) {
+      if constexpr (std::is_same<T, AccT>::value) {
        *current_out_ptr *= normalizer;
      } else {
-        AccT _exp = std::exp(*current_in_ptr - maximum);
-        *current_out_ptr = static_cast<T>(_exp * normalizer);
+        auto v = std::exp(*current_in_ptr - maximum);
+        *current_out_ptr = static_cast<T>(v * normalizer);
        current_in_ptr++;
      }
-      current_out_ptr++;
    }
  }
 }

 } // namespace

-void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Softmax::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  // Make sure that the last dimension is contiguous
@@ -143,7 +97,7 @@ void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
    case int16:
    case int32:
    case int64:
-      throw std::runtime_error(
+      throw std::invalid_argument(
          "Softmax is defined only for floating point types");
      break;
    case float32:
--- a/mlx/backend/common/sort.cpp
+++ b/mlx/backend/common/sort.cpp
@@ -5,8 +5,8 @@
 #include <cmath>
 #include <numeric>

+#include "mlx/backend/common/copy.h"
 #include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/copy.h"

 #include "mlx/primitives.h"

@@ -14,10 +14,10 @@ namespace mlx::core {

 namespace {

-template <typename T>
+template <typename T, typename IdxT = int32_t>
 struct StridedIterator {
  using iterator_category = std::random_access_iterator_tag;
-  using difference_type = int32_t;
+  using difference_type = IdxT;
  using value_type = T;
  using reference = value_type&;
  using pointer = value_type*;
@@ -287,7 +287,7 @@ void argpartition(const array& in, array& out, int axis, int kth) {

 } // namespace

-void ArgSort::eval_cpu(const std::vector<array>& inputs, array& out) {
+void ArgSort::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

@@ -321,7 +321,7 @@ void ArgSort::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Sort::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

@@ -355,7 +355,7 @@ void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-void ArgPartition::eval_cpu(const std::vector<array>& inputs, array& out) {
+void ArgPartition::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

@@ -389,7 +389,7 @@ void ArgPartition::eval_cpu(const std::vector<array>& inputs, array& out) {
  }
 }

-void Partition::eval_cpu(const std::vector<array>& inputs, array& out) {
+void Partition::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

--- a/mlx/backend/common/svd.cpp
+++ b/mlx/backend/common/svd.cpp
@@ -1,8 +1,8 @@
 // Copyright © 2024 Apple Inc.

 #include "mlx/allocator.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/lapack.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
@@ -137,9 +137,7 @@ void svd_impl(const array& a, array& u, array& s, array& vt) {
  }
 }

-void SVD::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
+void SVD::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
  if (!(inputs[0].dtype() == float32)) {
    throw std::runtime_error("[SVD::eval] only supports float32.");
  }
--- a/mlx/backend/common/ternary.h
+++ b/mlx/backend/common/ternary.h
@@ -3,10 +3,12 @@
 #pragma once
 #include "mlx/allocator.h"
 #include "mlx/array.h"
+#include "mlx/backend/common/ops.h"
 #include "mlx/backend/common/utils.h"
-
 namespace mlx::core {

+namespace {
+
 // TODO: Add support for more combinations of input types.
 enum class TernaryOpType {
  ScalarScalarScalar,
@@ -14,7 +16,7 @@ enum class TernaryOpType {
  General,
 };

-inline TernaryOpType
+TernaryOpType
 get_ternary_op_type(const array& a, const array& b, const array& c) {
  TernaryOpType topt;
  if (a.data_size() == 1 && b.data_size() == 1 && c.data_size() == 1) {
@@ -31,7 +33,7 @@ get_ternary_op_type(const array& a, const array& b, const array& c) {
  return topt;
 }

-inline void set_ternary_op_output_data(
+void set_ternary_op_output_data(
    const array& a,
    const array& b,
    const array& c,
@@ -65,14 +67,156 @@ inline void set_ternary_op_output_data(
      }
      break;
    case TernaryOpType::General:
-      // Try to donate an input which is row_contiguous
-      if (!((a.flags().row_contiguous && maybe_donate(a)) ||
-            (b.flags().row_contiguous && maybe_donate(b)) ||
-            (c.flags().row_contiguous && maybe_donate(c)))) {
-        out.set_data(allocator::malloc_or_wait(out.nbytes()));
-      }
+      out.set_data(allocator::malloc_or_wait(out.nbytes()));
      break;
  }
 }
+template <typename T1, typename T2, typename T3, typename U, typename Op, int D>
+void ternary_op_dims(
+    const T1* a,
+    const T2* b,
+    const T3* c,
+    U* out,
+    Op op,
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& c_strides,
+    const Strides& out_strides,
+    int axis) {
+  auto stride_a = a_strides[axis];
+  auto stride_b = b_strides[axis];
+  auto stride_c = c_strides[axis];
+  auto stride_out = out_strides[axis];
+  auto N = shape[axis];
+  // Walk `axis`; recurse into axis + 1 while D > 1, otherwise apply `op`.
+  for (int i = 0; i < N; i++) {
+    if constexpr (D > 1) {
+      ternary_op_dims<T1, T2, T3, U, Op, D - 1>(
+          a,
+          b,
+          c,
+          out,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          c_strides,
+          out_strides,
+          axis + 1);
+    } else {
+      *out = op(*a, *b, *c);
+    }
+    a += stride_a;
+    b += stride_b;
+    c += stride_c;
+    out += stride_out;
+  }
+}
+
+template <typename T1, typename T2, typename T3, typename U, typename Op>
+void ternary_op_dispatch_dims(
+    const array& a,
+    const array& b,
+    const array& c,
+    array& out,
+    Op op) {
+  auto [shape, strides] = collapse_contiguous_dims(
+      a.shape(), {a.strides(), b.strides(), c.strides(), out.strides()});
+  const auto& a_strides = strides[0];
+  const auto& b_strides = strides[1];
+  const auto& c_strides = strides[2];
+  const auto& out_strides = strides[3];
+  // Typed base pointers; the output must be read as U, not as an input type.
+  const T1* a_ptr = a.data<T1>();
+  const T2* b_ptr = b.data<T2>();
+  const T3* c_ptr = c.data<T3>();
+  U* out_ptr = out.data<U>();
+  int ndim = shape.size();
+  switch (ndim) {
+    case 1:
+      ternary_op_dims<T1, T2, T3, U, Op, 1>(
+          a_ptr,
+          b_ptr,
+          c_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          c_strides,
+          out_strides,
+          0);
+      return;
+    case 2:
+      ternary_op_dims<T1, T2, T3, U, Op, 2>(
+          a_ptr,
+          b_ptr,
+          c_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          c_strides,
+          out_strides,
+          0);
+      return;
+  }
+  // ndim > 2: run the D = 2 kernel on the last two dims for each outer step.
+  ContiguousIterator a_it(shape, a_strides, ndim - 2);
+  ContiguousIterator b_it(shape, b_strides, ndim - 2);
+  ContiguousIterator c_it(shape, c_strides, ndim - 2);
+  auto stride = out_strides[ndim - 3];
+  for (size_t elem = 0; elem < a.size(); elem += stride) {
+    ternary_op_dims<T1, T2, T3, U, Op, 2>(
+        a_ptr + a_it.loc,
+        b_ptr + b_it.loc,
+        c_ptr + c_it.loc,
+        out_ptr + elem,
+        op,
+        shape,
+        a_strides,
+        b_strides,
+        c_strides,
+        out_strides,
+        ndim - 2);
+    a_it.step();
+    b_it.step();
+    c_it.step();
+  }
+}
+
+template <typename T1, typename T2, typename T3, typename U, typename Op>
+void ternary_op(
+    const array& a,
+    const array& b,
+    const array& c,
+    array& out,
+    Op op) {
+  TernaryOpType topt = get_ternary_op_type(a, b, c);
+  set_ternary_op_output_data(a, b, c, out, topt);
+
+  // The full computation is scalar-scalar-scalar so we call the base op once.
+  if (topt == TernaryOpType::ScalarScalarScalar) {
+    *(out.data<U>()) = op(*a.data<T1>(), *b.data<T2>(), *c.data<T3>());
+  } else if (topt == TernaryOpType::VectorVectorVector) {
+    const T1* a_ptr = a.data<T1>();
+    const T2* b_ptr = b.data<T2>();
+    const T3* c_ptr = c.data<T3>();
+    U* out_ptr = out.data<U>();
+    for (size_t i = 0; i < out.size(); ++i) {
+      *out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
+      a_ptr++;
+      b_ptr++;
+      c_ptr++;
+      out_ptr++;
+    }
+  } else {
+    ternary_op_dispatch_dims<T1, T2, T3, U>(a, b, c, out, op);
+  }
+}
+
+} // namespace

 } // namespace mlx::core
--- a/mlx/backend/common/threefry.cpp
+++ b/mlx/backend/common/threefry.cpp
@@ -1,6 +1,6 @@
 // Copyright © 2023 Apple Inc.

-#include "mlx/backend/cpu/threefry.h"
+#include "mlx/backend/common/threefry.h"

 namespace mlx::core::random {

--- a/mlx/backend/common/threefry.h
+++ b/mlx/backend/common/threefry.h
--- a/mlx/backend/common/unary.h
+++ b/mlx/backend/common/unary.h
@@ -5,11 +5,12 @@
 #include "mlx/allocator.h"
 #include "mlx/array.h"
 #include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/simd/simd.h"
 #include "mlx/utils.h"

 namespace mlx::core {

+namespace {
+
 void set_unary_output_data(const array& in, array& out) {
  if (is_donatable(in, out)) {
    out.copy_shared_buffer(in);
@@ -37,19 +38,8 @@ void unary_op(const array& a, array& out, Op op) {
  if (a.flags().contiguous) {
    set_unary_output_data(a, out);
    U* dst = out.data<U>();
-    constexpr int N = simd::max_size<T>;
-    size_t size = a.data_size();
-    while (size >= N) {
-      simd::store(dst, op(simd::load<T, N>(a_ptr)));
-      size -= N;
-      a_ptr += N;
-      dst += N;
-    }
-    while (size > 0) {
-      *dst = op(*a_ptr);
-      size--;
-      dst++;
-      a_ptr++;
+    for (size_t i = 0; i < a.data_size(); ++i) {
+      dst[i] = op(a_ptr[i]);
    }
  } else {
    out.set_data(allocator::malloc_or_wait(out.nbytes()));
@@ -135,4 +125,6 @@ void unary_fp(const array& a, array& out, Op op) {
  }
 }

+} // namespace
+
 } // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -107,7 +107,7 @@ struct ContiguousIterator {
      : shape_(a.shape()), strides_(a.strides()) {
    if (!shape_.empty()) {
      std::tie(shape_, strides_) = collapse_contiguous_dims(shape_, strides_);
-      pos_ = Shape(shape_.size(), 0);
+      pos_ = std::vector<int>(shape_.size(), 0);
    }
  }

--- a/mlx/backend/cpu/CMakeLists.txt
+++ b/mlx/backend/cpu/CMakeLists.txt
@@ -1,81 +0,0 @@
-if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-  set(COMPILER ${CMAKE_C_COMPILER})
-  set(CLANG TRUE)
-else()
-  set(COMPILER ${CMAKE_CXX_COMPILER})
-endif()
-
-set(COMPILE_DEPS
-    ${PROJECT_SOURCE_DIR}/mlx/types/half_types.h
-    ${PROJECT_SOURCE_DIR}/mlx/types/fp16.h
-    ${PROJECT_SOURCE_DIR}/mlx/types/bf16.h
-    ${PROJECT_SOURCE_DIR}/mlx/types/complex.h
-    simd/simd.h
-    simd/base_simd.h
-    simd/math.h
-    simd/type.h
-    unary_ops.h
-    binary_ops.h)
-
-if(MSVC)
-  set(SHELL_EXT ps1)
-  set(SHELL_CMD powershell -ExecutionPolicy Bypass -File)
-else()
-  set(SHELL_EXT sh)
-  set(SHELL_CMD bash)
-endif()
-
-add_custom_command(
-  OUTPUT compiled_preamble.cpp
-  COMMAND
-    ${SHELL_CMD} ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.${SHELL_EXT}
-    ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp ${COMPILER}
-    ${PROJECT_SOURCE_DIR} ${CLANG} ${CMAKE_SYSTEM_PROCESSOR}
-  DEPENDS make_compiled_preamble.${SHELL_EXT} compiled_preamble.h
-          ${COMPILE_DEPS})
-
-add_custom_target(cpu_compiled_preamble DEPENDS compiled_preamble.cpp)
-
-add_dependencies(mlx cpu_compiled_preamble)
-
-target_sources(
-  mlx
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cblas.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
-          ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp)
-
-if(MLX_BUILD_ACCELERATE)
-  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/bnns.cpp)
-else()
-  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/no_fp16.cpp
-                             ${CMAKE_CURRENT_SOURCE_DIR}/gemms/no_bf16.cpp)
-endif()
-
-if(IOS)
-  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../no_cpu/compiled.cpp)
-else()
-  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
-                             ${CMAKE_CURRENT_SOURCE_DIR}/jit_compiler.cpp)
-endif()
--- a/mlx/backend/cpu/binary.h
+++ b/mlx/backend/cpu/binary.h
@@ -1,370 +0,0 @@
-// Copyright © 2023 Apple Inc.
-
-#pragma once
-#include <cassert>
-
-#include "mlx/allocator.h"
-#include "mlx/array.h"
-#include "mlx/backend/common/binary.h"
-#include "mlx/backend/common/utils.h"
-
-#include "mlx/backend/cpu/simd/simd.h"
-
-namespace mlx::core {
-
-template <typename Op>
-struct VectorScalar {
-  Op op;
-
-  VectorScalar(Op op_) : op(op_) {}
-
-  template <typename T, typename U>
-  void operator()(const T* a, const T* b, U* dst, int size) {
-    T scalar = *b;
-    constexpr int N = simd::max_size<T>;
-    while (size >= N) {
-      simd::store(dst, op(simd::load<T, N>(a), simd::Simd<T, N>(scalar)));
-      dst += N;
-      a += N;
-      size -= N;
-    }
-    while (size-- > 0) {
-      *dst = op(*a, scalar);
-      dst++;
-      a++;
-    }
-  }
-};
-
-template <typename Op>
-struct ScalarVector {
-  Op op;
-
-  ScalarVector(Op op_) : op(op_) {}
-
-  template <typename T, typename U>
-  void operator()(const T* a, const T* b, U* dst, int size) {
-    T scalar = *a;
-    constexpr int N = simd::max_size<T>;
-    while (size >= N) {
-      simd::store(dst, op(simd::Simd<T, N>(scalar), simd::load<T, N>(b)));
-      dst += N;
-      b += N;
-      size -= N;
-    }
-    while (size-- > 0) {
-      *dst = op(scalar, *b);
-      dst++;
-      b++;
-    }
-  }
-};
-
-template <typename Op>
-struct VectorVector {
-  Op op;
-
-  VectorVector(Op op_) : op(op_) {}
-
-  template <typename T, typename U>
-  void operator()(const T* a, const T* b, U* dst, int size) {
-    constexpr int N = simd::max_size<T>;
-    while (size >= N) {
-      simd::store(dst, op(simd::load<T, N>(a), simd::load<T, N>(b)));
-      dst += N;
-      a += N;
-      b += N;
-      size -= N;
-    }
-    while (size-- > 0) {
-      *dst = op(*a, *b);
-      dst++;
-      a++;
-      b++;
-    }
-  }
-};
-
-template <typename T, typename U, typename Op, int D, bool Strided>
-void binary_op_dims(
-    const T* a,
-    const T* b,
-    U* out,
-    Op op,
-    const Shape& shape,
-    const Strides& a_strides,
-    const Strides& b_strides,
-    const Strides& out_strides,
-    int axis) {
-  auto stride_a = a_strides[axis];
-  auto stride_b = b_strides[axis];
-  auto stride_out = out_strides[axis];
-  auto N = shape[axis];
-
-  for (int i = 0; i < N; i++) {
-    if constexpr (D > 1) {
-      binary_op_dims<T, U, Op, D - 1, Strided>(
-          a, b, out, op, shape, a_strides, b_strides, out_strides, axis + 1);
-    } else {
-      if constexpr (Strided) {
-        op(a, b, out, stride_out);
-      } else {
-        *out = op(*a, *b);
-      }
-    }
-    out += stride_out;
-    a += stride_a;
-    b += stride_b;
-  }
-}
-
-template <typename T, typename U, bool Strided, typename Op>
-void binary_op_dispatch_dims(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    int dim,
-    const Shape& shape,
-    const Strides& a_strides,
-    const Strides& b_strides,
-    const Strides& out_strides) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* out_ptr = out.data<U>();
-  switch (dim) {
-    case 1:
-      binary_op_dims<T, U, Op, 1, Strided>(
-          a_ptr,
-          b_ptr,
-          out_ptr,
-          op,
-          shape,
-          a_strides,
-          b_strides,
-          out_strides,
-          0);
-      return;
-    case 2:
-      binary_op_dims<T, U, Op, 2, Strided>(
-          a_ptr,
-          b_ptr,
-          out_ptr,
-          op,
-          shape,
-          a_strides,
-          b_strides,
-          out_strides,
-          0);
-      return;
-    case 3:
-      binary_op_dims<T, U, Op, 3, Strided>(
-          a_ptr,
-          b_ptr,
-          out_ptr,
-          op,
-          shape,
-          a_strides,
-          b_strides,
-          out_strides,
-          0);
-      return;
-  }
-
-  ContiguousIterator a_it(shape, a_strides, dim - 3);
-  ContiguousIterator b_it(shape, b_strides, dim - 3);
-  auto stride = out_strides[dim - 4];
-  for (int64_t elem = 0; elem < a.size(); elem += stride) {
-    binary_op_dims<T, U, Op, 3, Strided>(
-        a_ptr + a_it.loc,
-        b_ptr + b_it.loc,
-        out_ptr + elem,
-        op,
-        shape,
-        a_strides,
-        b_strides,
-        out_strides,
-        dim - 3);
-    a_it.step();
-    b_it.step();
-  }
-}
-
-template <typename T, typename U, typename Op>
-void binary_op(const array& a, const array& b, array& out, Op op) {
-  auto bopt = get_binary_op_type(a, b);
-  set_binary_op_output_data(a, b, out, bopt);
-
-  // The full computation is scalar scalar so call the base op once
-  if (bopt == BinaryOpType::ScalarScalar) {
-    *(out.data<U>()) = op(*a.data<T>(), *b.data<T>());
-    return;
-  }
-
-  // The full computation is scalar vector so delegate to the op
-  if (bopt == BinaryOpType::ScalarVector) {
-    ScalarVector{op}(a.data<T>(), b.data<T>(), out.data<U>(), b.data_size());
-    return;
-  }
-
-  // The full computation is vector scalar so delegate to the op
-  if (bopt == BinaryOpType::VectorScalar) {
-    VectorScalar{op}(a.data<T>(), b.data<T>(), out.data<U>(), a.data_size());
-    return;
-  }
-
-  // The full computation is vector vector so delegate to the op
-  if (bopt == BinaryOpType::VectorVector) {
-    VectorVector{op}(a.data<T>(), b.data<T>(), out.data<U>(), out.size());
-    return;
-  }
-
-  // General computation so let's try to optimize
-  auto [new_shape, new_strides] = collapse_contiguous_dims(
-      a.shape(), {a.strides(), b.strides(), out.strides()});
-  const auto& a_strides = new_strides[0];
-  const auto& b_strides = new_strides[1];
-  const auto& strides = new_strides[2];
-
-  // Get the left-most dim such that the array is row contiguous after
-  auto leftmost_rc_dim = [&strides](const auto& arr_strides) {
-    int d = arr_strides.size() - 1;
-    for (; d >= 0 && arr_strides[d] == strides[d]; d--) {
-    }
-    return d + 1;
-  };
-  auto a_rc_dim = leftmost_rc_dim(a_strides);
-  auto b_rc_dim = leftmost_rc_dim(b_strides);
-
-  // Get the left-most dim such that the array is a broadcasted "scalar" after
-  auto leftmost_s_dim = [](const auto& arr_strides) {
-    int d = arr_strides.size() - 1;
-    for (; d >= 0 && arr_strides[d] == 0; d--) {
-    }
-    return d + 1;
-  };
-  auto a_s_dim = leftmost_s_dim(a_strides);
-  auto b_s_dim = leftmost_s_dim(b_strides);
-
-  auto ndim = new_shape.size();
-
-  // Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
-  int dim = ndim;
-  if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
-    bopt = BinaryOpType::VectorVector;
-    dim = d;
-    // Case 2: LxM and Fx1 where L and F are broadcastable and M is row
-    // contiguous
-  } else if (int d = std::max(a_rc_dim, b_s_dim); d < ndim) {
-    bopt = BinaryOpType::VectorScalar;
-    dim = d;
-    // Case 3: Lx1 and FxM where L and F are broadcastable and M is row
-    // contiguous
-  } else if (int d = std::max(a_s_dim, b_rc_dim); d < ndim) {
-    bopt = BinaryOpType::ScalarVector;
-    dim = d;
-  }
-
-  // Can be sure dim > 0 since otherwise we would have used one of the fully
-  // contiguous methods above. Except for the case that the flags do not
-  // correspond to the underlying contiguity.
-  if (dim == 0 || strides[dim - 1] < 16) {
-    bopt = BinaryOpType::General;
-    dim = ndim;
-  }
-
-  switch (bopt) {
-    case BinaryOpType::VectorVector:
-      binary_op_dispatch_dims<T, U, true>(
-          a,
-          b,
-          out,
-          VectorVector{op},
-          dim,
-          new_shape,
-          a_strides,
-          b_strides,
-          strides);
-      break;
-    case BinaryOpType::VectorScalar:
-      binary_op_dispatch_dims<T, U, true>(
-          a,
-          b,
-          out,
-          VectorScalar{op},
-          dim,
-          new_shape,
-          a_strides,
-          b_strides,
-          strides);
-      break;
-    case BinaryOpType::ScalarVector:
-      binary_op_dispatch_dims<T, U, true>(
-          a,
-          b,
-          out,
-          ScalarVector{op},
-          dim,
-          new_shape,
-          a_strides,
-          b_strides,
-          strides);
-      break;
-    default:
-      binary_op_dispatch_dims<T, U, false>(
-          a, b, out, op, dim, new_shape, a_strides, b_strides, strides);
-      break;
-  }
-}
-
-template <typename T, typename Op>
-void binary_op(const array& a, const array& b, array& out, Op op) {
-  binary_op<T, T>(a, b, out, op);
-}
-
-template <typename Op>
-void binary(const array& a, const array& b, array& out, Op op) {
-  switch (out.dtype()) {
-    case bool_:
-      binary_op<bool>(a, b, out, op);
-      break;
-    case uint8:
-      binary_op<uint8_t>(a, b, out, op);
-      break;
-    case uint16:
-      binary_op<uint16_t>(a, b, out, op);
-      break;
-    case uint32:
-      binary_op<uint32_t>(a, b, out, op);
-      break;
-    case uint64:
-      binary_op<uint64_t>(a, b, out, op);
-      break;
-    case int8:
-      binary_op<int8_t>(a, b, out, op);
-      break;
-    case int16:
-      binary_op<int16_t>(a, b, out, op);
-      break;
-    case int32:
-      binary_op<int32_t>(a, b, out, op);
-      break;
-    case int64:
-      binary_op<int64_t>(a, b, out, op);
-      break;
-    case float16:
-      binary_op<float16_t>(a, b, out, op);
-      break;
-    case float32:
-      binary_op<float>(a, b, out, op);
-      break;
-    case bfloat16:
-      binary_op<bfloat16_t>(a, b, out, op);
-      break;
-    case complex64:
-      binary_op<complex64_t>(a, b, out, op);
-      break;
-  }
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/binary_ops.h
+++ b/mlx/backend/cpu/binary_ops.h
@@ -1,98 +0,0 @@
-// Copyright © 2023-2024 Apple Inc.
-
-#pragma once
-
-#include "mlx/backend/cpu/simd/simd.h"
-
-namespace mlx::core::detail {
-
-using namespace mlx::core::simd;
-
-#define BINARY_SINGLE()                                 \
-  template <typename T>                                 \
-  T operator()(T x, T y) {                              \
-    return (*this)(Simd<T, 1>(x), Simd<T, 1>(y)).value; \
-  }
-
-#define DEFAULT_BINARY_OP(Op, op)                       \
-  struct Op {                                           \
-    template <int N, typename T>                        \
-    Simd<T, N> operator()(Simd<T, N> x, Simd<T, N> y) { \
-      return op(x, y);                                  \
-    }                                                   \
-    BINARY_SINGLE()                                     \
-  };
-
-DEFAULT_BINARY_OP(Add, operator+)
-DEFAULT_BINARY_OP(ArcTan2, atan2)
-DEFAULT_BINARY_OP(Divide, operator/)
-DEFAULT_BINARY_OP(Multiply, operator*)
-DEFAULT_BINARY_OP(Subtract, operator-)
-DEFAULT_BINARY_OP(LogicalAnd, operator&&)
-DEFAULT_BINARY_OP(LogicalOr, operator||)
-DEFAULT_BINARY_OP(BitwiseAnd, operator&)
-DEFAULT_BINARY_OP(BitwiseOr, operator|)
-DEFAULT_BINARY_OP(BitwiseXor, operator^)
-DEFAULT_BINARY_OP(LeftShift, operator<<)
-DEFAULT_BINARY_OP(RightShift, operator>>)
-DEFAULT_BINARY_OP(Remainder, remainder)
-DEFAULT_BINARY_OP(Maximum, maximum)
-DEFAULT_BINARY_OP(Minimum, minimum)
-DEFAULT_BINARY_OP(Power, pow)
-
-#define DEFAULT_BOOL_OP(Op, op)                            \
-  struct Op {                                              \
-    template <int N, typename T>                           \
-    Simd<bool, N> operator()(Simd<T, N> x, Simd<T, N> y) { \
-      return op(x, y);                                     \
-    }                                                      \
-    template <typename T>                                  \
-    bool operator()(T x, T y) {                            \
-      return (*this)(Simd<T, 1>(x), Simd<T, 1>(y)).value;  \
-    }                                                      \
-  };
-
-DEFAULT_BOOL_OP(Equal, operator==)
-DEFAULT_BOOL_OP(Greater, operator>)
-DEFAULT_BOOL_OP(GreaterEqual, operator>=)
-DEFAULT_BOOL_OP(Less, operator<)
-DEFAULT_BOOL_OP(LessEqual, operator<=)
-DEFAULT_BOOL_OP(NotEqual, operator!=)
-
-struct NaNEqual {
-  template <int N, typename T>
-  Simd<bool, N> operator()(Simd<T, N> x, Simd<T, N> y) {
-    return x == y || (isnan(x) && isnan(y));
-  }
-  template <typename T>
-  bool operator()(T x, T y) {
-    return (*this)(Simd<T, 1>(x), Simd<T, 1>(y)).value;
-  }
-};
-
-struct LogAddExp {
-  template <int N, typename T>
-  Simd<T, N> operator()(Simd<T, N> x, Simd<T, N> y) {
-    auto maxval = maximum(x, y);
-    auto minval = minimum(x, y);
-    auto mask = minval == -inf || maxval == inf;
-    auto out = maxval + log1p(exp(minval - maxval));
-    return select(mask, Simd<T, N>(maxval), Simd<T, N>(out));
-  }
-  BINARY_SINGLE()
-};
-
-struct Select {
-  template <typename T>
-  T operator()(bool condition, T x, T y) {
-    return (*this)(Simd<bool, 1>(condition), Simd<T, 1>(x), Simd<T, 1>(y))
-        .value;
-  }
-
-  template <int N, typename T>
-  Simd<T, N> operator()(Simd<bool, N> condition, Simd<T, N> x, Simd<T, N> y) {
-    return select(condition, x, y);
-  }
-};
-
-} // namespace mlx::core::detail
--- a/mlx/backend/cpu/copy.h
+++ b/mlx/backend/cpu/copy.h
@@ -1,24 +0,0 @@
-// Copyright © 2023-2024 Apple Inc.
-
-#pragma once
-
-#include "mlx/array.h"
-#include "mlx/backend/common/copy.h"
-#include "mlx/backend/common/utils.h"
-
-namespace mlx::core {
-
-void copy(const array& src, array& dst, CopyType ctype);
-void copy_inplace(const array& src, array& dst, CopyType ctype);
-
-void copy_inplace(
-    const array& src,
-    array& dst,
-    const Shape& data_shape,
-    const Strides& i_strides,
-    const Strides& o_strides,
-    int64_t i_offset,
-    int64_t o_offset,
-    CopyType ctype);
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/gemm.h
+++ b/mlx/backend/cpu/gemm.h
@@ -1,20 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#pragma once
-#include "mlx/array.h"
-
-namespace mlx::core {
-
-template <typename T>
-void matmul(
-    const array& a,
-    const array& b,
-    array& out,
-    bool a_transposed,
-    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    float alpha,
-    float beta);
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/bnns.cpp
+++ b/mlx/backend/cpu/gemms/bnns.cpp
@@ -1,157 +0,0 @@
-// Copyright © 2023-2024 Apple Inc.
-
-#include <Accelerate/Accelerate.h>
-
-#include "mlx/array.h"
-#include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/gemm.h"
-#include "mlx/dtype.h"
-
-namespace mlx::core {
-
-BNNSDataType to_bnns_dtype(Dtype mlx_dtype) {
-  uint32_t size_bits = size_of(mlx_dtype) * 8;
-  switch (kindof(mlx_dtype)) {
-    case Dtype::Kind::b:
-      return BNNSDataTypeBoolean;
-    case Dtype::Kind::u:
-      return BNNSDataType(BNNSDataTypeUIntBit | size_bits);
-    case Dtype::Kind::i:
-      return BNNSDataType(BNNSDataTypeIntBit | size_bits);
-    case Dtype::Kind::f:
-      return BNNSDataType(BNNSDataTypeFloatBit | size_bits);
-    case Dtype::Kind::V:
-      return BNNSDataTypeBFloat16;
-    case Dtype::Kind::c:
-      throw std::invalid_argument("BNNS does not support complex types");
-  }
-}
-
-void matmul_bnns(
-    const array& a,
-    const array& b,
-    array& out,
-    bool a_transposed,
-    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    float alpha,
-    float beta) {
-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
-
-  BNNSDataType bnns_dtype = to_bnns_dtype(out.dtype());
-
-  const BNNSLayerParametersBroadcastMatMul gemm_params{
-      /* float alpha = */ alpha,
-      /* float beta = */ beta,
-      /* bool transA = */ a_transposed,
-      /* bool transB = */ b_transposed,
-      /* bool quadratic = */ false,
-      /* bool a_is_weights = */ false,
-      /* bool b_is_weights = */ false,
-      /* BNNSNDArrayDescriptor iA_desc = */
-      BNNSNDArrayDescriptor{
-          /* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
-          /* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,
-
-          /* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
-          {lda, (M * K) / lda, 0, 0, 0, 0, 0, 0},
-          /* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
-          {1, lda, 0, 0, 0, 0, 0, 0},
-
-          /* void * _Nullable data = */ nullptr,
-          /* BNNSDataType data_type = */ bnns_dtype,
-
-          /* void * _Nullable table_data = */ nullptr,
-          /* BNNSDataType table_data_type = */ bnns_dtype,
-
-          /* float data_scale = */ 1.0,
-          /* float data_bias = */ 0.0,
-      },
-      /* BNNSNDArrayDescriptor iB_desc = */
-      BNNSNDArrayDescriptor{
-          /* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
-          /* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,
-
-          /* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
-          {ldb, (K * N) / ldb, 0, 0, 0, 0, 0, 0},
-          /* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
-          {1, ldb, 0, 0, 0, 0, 0, 0},
-
-          /* void * _Nullable data = */ nullptr,
-          /* BNNSDataType data_type = */ bnns_dtype,
-
-          /* void * _Nullable table_data = */ nullptr,
-          /* BNNSDataType table_data_type = */ bnns_dtype,
-
-          /* float data_scale = */ 1.0,
-          /* float data_bias = */ 0.0,
-      },
-      /* BNNSNDArrayDescriptor o_desc = */
-      BNNSNDArrayDescriptor{
-          /* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
-          /* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,
-
-          /* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
-          {N, M, 0, 0, 0, 0, 0, 0},
-          /* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
-          {1, N, 0, 0, 0, 0, 0, 0},
-
-          /* void * _Nullable data = */ nullptr,
-          /* BNNSDataType data_type = */ bnns_dtype,
-
-          /* void * _Nullable table_data = */ nullptr,
-          /* BNNSDataType table_data_type = */ bnns_dtype,
-
-          /* float data_scale = */ 1.0,
-          /* float data_bias = */ 0.0,
-      },
-  };
-
-  auto bnns_filter =
-      BNNSFilterCreateLayerBroadcastMatMul(&gemm_params, nullptr);
-
-  for (int i = 0; i < (a.size() / (M * K)); ++i) {
-    BNNSFilterApplyTwoInput(
-        bnns_filter,
-        a.data<uint8_t>() +
-            elem_to_loc(M * K * i, a.shape(), a.strides()) * a.itemsize(),
-        b.data<uint8_t>() +
-            elem_to_loc(K * N * i, b.shape(), b.strides()) * b.itemsize(),
-        out.data<uint8_t>() + M * N * i * out.itemsize());
-  }
-
-  BNNSFilterDestroy(bnns_filter);
-}
-
-template <>
-void matmul<float16_t>(
-    const array& a,
-    const array& b,
-    array& out,
-    bool a_transposed,
-    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    float alpha,
-    float beta) {
-  matmul_bnns(a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
-}
-
-template <>
-void matmul<bfloat16_t>(
-    const array& a,
-    const array& b,
-    array& out,
-    bool a_transposed,
-    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    float alpha,
-    float beta) {
-  matmul_bnns(a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/cblas.cpp
+++ b/mlx/backend/cpu/gemms/cblas.cpp
@@ -1,44 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/gemm.h"
-#include "mlx/backend/cpu/lapack.h"
-
-namespace mlx::core {
-
-template <>
-void matmul<float>(
-    const array& a,
-    const array& b,
-    array& out,
-    bool a_transposed,
-    bool b_transposed,
-    size_t lda,
-    size_t ldb,
-    float alpha,
-    float beta) {
-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
-
-  for (int i = 0; i < (a.size() / (M * K)); ++i) {
-    cblas_sgemm(
-        CblasRowMajor,
-        a_transposed ? CblasTrans : CblasNoTrans, // transA
-        b_transposed ? CblasTrans : CblasNoTrans, // transB
-        M,
-        N,
-        K,
-        alpha, // alpha
-        a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
-        lda,
-        b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
-        ldb,
-        beta, // beta
-        out.data<float>() + M * N * i,
-        out.shape(-1) // ldc
-    );
-  }
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/no_bf16.cpp
+++ b/mlx/backend/cpu/gemms/no_bf16.cpp
@@ -1,21 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cpu/gemm.h"
-
-namespace mlx::core {
-
-template <>
-void matmul<bfloat16_t>(
-    const array&,
-    const array&,
-    array&,
-    bool,
-    bool,
-    size_t,
-    size_t,
-    float,
-    float) {
-  throw std::runtime_error("[Matmul::eval_cpu] bfloat16 not supported.");
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/gemms/no_fp16.cpp
+++ b/mlx/backend/cpu/gemms/no_fp16.cpp
@@ -1,21 +0,0 @@
-// Copyright © 2025 Apple Inc.
-
-#include "mlx/backend/cpu/gemm.h"
-
-namespace mlx::core {
-
-template <>
-void matmul<float16_t>(
-    const array&,
-    const array&,
-    array&,
-    bool,
-    bool,
-    size_t,
-    size_t,
-    float,
-    float) {
-  throw std::runtime_error("[Matmul::eval_cpu] float16 not supported.");
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/jit_compiler.cpp
+++ b/mlx/backend/cpu/jit_compiler.cpp
@@ -1,152 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-#include "mlx/backend/cpu/jit_compiler.h"
-
-#include <sstream>
-#include <vector>
-
-#include <fmt/format.h>
-
-namespace mlx::core {
-
-#ifdef _MSC_VER
-
-namespace {
-
-// Split string into array.
-std::vector<std::string> str_split(const std::string& str, char delimiter) {
-  std::vector<std::string> tokens;
-  std::string token;
-  std::istringstream tokenStream(str);
-  while (std::getline(tokenStream, token, delimiter)) {
-    tokens.push_back(token);
-  }
-  return tokens;
-}
-
-// Get path information about MSVC.
-struct VisualStudioInfo {
-  VisualStudioInfo() {
-#ifdef _M_ARM64
-    arch = "arm64";
-#else
-    arch = "x64";
-#endif
-    // Get path of Visual Studio.
-    std::string vs_path = JitCompiler::exec(fmt::format(
-        "\"{0}\\Microsoft Visual Studio\\Installer\\vswhere.exe\""
-        " -property installationPath",
-        std::getenv("ProgramFiles(x86)")));
-    if (vs_path.empty()) {
-      throw std::runtime_error("Can not find Visual Studio.");
-    }
-    // Read the envs from vcvarsall.
-    std::string envs = JitCompiler::exec(fmt::format(
-        "\"{0}\\VC\\Auxiliary\\Build\\vcvarsall.bat\" {1} >NUL && set",
-        vs_path,
-        arch));
-    for (const std::string& line : str_split(envs, '\n')) {
-      // Each line is in the format "ENV_NAME=values".
-      auto pos = line.find_first_of('=');
-      if (pos == std::string::npos || pos == 0 || pos == line.size() - 1)
-        continue;
-      std::string name = line.substr(0, pos);
-      std::string value = line.substr(pos + 1);
-      if (name == "LIB") {
-        libpaths = str_split(value, ';');
-      } else if (name == "VCToolsInstallDir") {
-        cl_exe = fmt::format("{0}\\bin\\Host{1}\\{1}\\cl.exe", value, arch);
-      }
-    }
-  }
-  std::string arch;
-  std::string cl_exe;
-  std::vector<std::string> libpaths;
-};
-
-const VisualStudioInfo& GetVisualStudioInfo() {
-  static VisualStudioInfo info;
-  return info;
-}
-
-} // namespace
-
-#endif // _MSC_VER
-
-std::string JitCompiler::build_command(
-    const std::filesystem::path& dir,
-    const std::string& source_file_name,
-    const std::string& shared_lib_name) {
-#ifdef _MSC_VER
-  const VisualStudioInfo& info = GetVisualStudioInfo();
-  std::string libpaths;
-  for (const std::string& lib : info.libpaths) {
-    libpaths += fmt::format(" /libpath:\"{0}\"", lib);
-  }
-  return fmt::format(
-      "\""
-      "cd /D \"{0}\" && "
-      "\"{1}\" /LD /EHsc /MD /Ox /nologo /std:c++17 \"{2}\" "
-      "/link /out:\"{3}\" {4} 2>&1"
-      "\"",
-      dir.string(),
-      info.cl_exe,
-      source_file_name,
-      shared_lib_name,
-      libpaths);
-#else
-  return fmt::format(
-      "g++ -std=c++17 -O3 -Wall -fPIC -shared \"{0}\" -o \"{1}\" 2>&1",
-      (dir / source_file_name).string(),
-      (dir / shared_lib_name).string());
-#endif
-}
-
-std::string JitCompiler::exec(const std::string& cmd) {
-#ifdef _MSC_VER
-  FILE* pipe = _popen(cmd.c_str(), "r");
-#else
-  FILE* pipe = popen(cmd.c_str(), "r");
-#endif
-  if (!pipe) {
-    throw std::runtime_error("popen() failed.");
-  }
-  char buffer[128];
-  std::string ret;
-  while (fgets(buffer, sizeof(buffer), pipe)) {
-    ret += buffer;
-  }
-  // Trim trailing spaces.
-  ret.erase(
-      std::find_if(
-          ret.rbegin(),
-          ret.rend(),
-          [](unsigned char ch) { return !std::isspace(ch); })
-          .base(),
-      ret.end());
-
-#ifdef _MSC_VER
-  int status = _pclose(pipe);
-#else
-  int status = pclose(pipe);
-#endif
-  if (status == -1) {
-    throw std::runtime_error("pclose() failed.");
-  }
-#if defined(_WIN32) || defined(__FreeBSD__)
-  int code = status;
-#else
-  int code = WEXITSTATUS(status);
-#endif
-  if (code != 0) {
-    throw std::runtime_error(fmt::format(
-        "Failed to execute command with return code {0}: \"{1}\", "
-        "the output is: {2}",
-        code,
-        cmd,
-        ret));
-  }
-  return ret;
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/jit_compiler.h
+++ b/mlx/backend/cpu/jit_compiler.h
@@ -1,20 +0,0 @@
-// Copyright © 2024 Apple Inc.
-#pragma once
-
-#include <filesystem>
-
-namespace mlx::core {
-
-class JitCompiler {
- public:
-  // Build a shell command that compiles a source code file to a shared library.
-  static std::string build_command(
-      const std::filesystem::path& dir,
-      const std::string& source_file_name,
-      const std::string& shared_lib_name);
-
-  // Run a command and get its output.
-  static std::string exec(const std::string& cmd);
-};
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/matmul.cpp
+++ b/mlx/backend/cpu/matmul.cpp
@@ -1,79 +0,0 @@
-// Copyright © 2023-2024 Apple Inc.
-
-#include <cstring>
-#include "mlx/array.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/gemm.h"
-#include "mlx/primitives.h"
-
-namespace mlx::core {
-
-void matmul_general(
-    const array& a_pre,
-    const array& b_pre,
-    array& out,
-    float alpha = 1.0f,
-    float beta = 0.0f) {
-  auto check_transpose = [](const array& arr) {
-    auto stx = arr.strides()[arr.ndim() - 2];
-    auto sty = arr.strides()[arr.ndim() - 1];
-    if (stx == arr.shape(-1) && sty == 1) {
-      return std::make_tuple(false, stx, arr);
-    } else if (stx == 1 && sty == arr.shape(-2)) {
-      return std::make_tuple(true, sty, arr);
-    } else {
-      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
-      copy(arr, arr_copy, CopyType::General);
-      stx = arr.shape(-1);
-      return std::make_tuple(false, stx, arr_copy);
-    }
-  };
-
-  auto [a_transposed, lda, a] = check_transpose(a_pre);
-  auto [b_transposed, ldb, b] = check_transpose(b_pre);
-  size_t M = a.shape(-2);
-  size_t N = b.shape(-1);
-  size_t K = a.shape(-1);
-  if (M == 0 || N == 0) {
-    return;
-  }
-
-  if (out.dtype() == float32) {
-    matmul<float>(a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
-  } else if (out.dtype() == float16) {
-    matmul<float16_t>(
-        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
-  } else if (out.dtype() == bfloat16) {
-    matmul<bfloat16_t>(
-        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta);
-  } else {
-    throw std::runtime_error("[Matmul::eval_cpu] Invalid type.");
-  }
-}
-
-void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-  if (inputs[0].shape(-1) == 0) {
-    std::memset(out.data<void>(), 0, out.nbytes());
-    return;
-  }
-  return matmul_general(inputs[0], inputs[1], out);
-}
-
-void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
-  if (out.dtype() != float32) {
-    throw std::runtime_error(
-        "[AddMM::eval_cpu] Currently only supports float32.");
-  }
-
-  // Fill output with C
-  auto& c = inputs[2];
-  CopyType ctype = c.data_size() == 1
-      ? CopyType::Scalar
-      : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
-  copy(c, out, ctype);
-
-  return matmul_general(inputs[0], inputs[1], out, alpha_, beta_);
-}
-
-} // namespace mlx::core
--- a/mlx/backend/cpu/primitives.cpp
+++ b/mlx/backend/cpu/primitives.cpp
@@ -1,389 +0,0 @@
-// Copyright © 2023-2024 Apple Inc.
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <numeric>
-#include <sstream>
-
-#include "mlx/allocator.h"
-#include "mlx/backend/common/load.h"
-#include "mlx/backend/common/slicing.h"
-#include "mlx/backend/common/utils.h"
-#include "mlx/backend/cpu/arange.h"
-#include "mlx/backend/cpu/copy.h"
-#include "mlx/backend/cpu/threefry.h"
-#include "mlx/primitives.h"
-#include "mlx/utils.h"
-
-namespace mlx::core {
-
-void reshape(const array& in, array& out) {
-  auto [copy_necessary, out_strides] = prepare_reshape(in, out);
-  if (copy_necessary) {
-    out.set_data(allocator::malloc_or_wait(out.nbytes()));
-    copy_inplace(in, out, CopyType::General);
-  } else {
-    shared_buffer_reshape(in, out_strides, out);
-  }
-}
-
-int64_t compute_dynamic_offset(
-    const array& indices,
-    const Strides& strides,
-    const std::vector<int>& axes) {
-  auto compute_offset = [&strides, &axes](const auto* indices) {
-    int64_t offset = 0;
-    for (int i = 0; i < axes.size(); ++i) {
-      offset += indices[i] * strides[axes[i]];
-    }
-    return offset;
-  };
-  switch (indices.dtype()) {
-    case int8:
-    case uint8:
-      return compute_offset(indices.data<uint8_t>());
-    case int16:
-    case uint16:
-      return compute_offset(indices.data<uint16_t>());
-    case int32:
-    case uint32:
-      return compute_offset(indices.data<uint32_t>());
-    case int64:
-    case uint64:
-      return compute_offset(indices.data<uint64_t>());
-    default:
-      throw std::runtime_error("Invalid indices type.");
-  }
-}
-
-void AsStrided::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
-}
-void Broadcast::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
-}
-void BroadcastAxes::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
-}
-void Copy::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
-}
-void CustomTransforms::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  eval(inputs, outputs);
-}
-void Depends::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  eval(inputs, outputs);
-}
-void ExpandDims::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
-}
-void NumberOfElements::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
-}
-void Slice::eval_cpu(const std::vector<array>& inputs, array& out) {
-  slice(inputs[0], out, start_indices_, strides_);
-}
-void Split::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
-  eval(inputs, outputs);
-}
-void Squeeze::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
-}
-void StopGradient::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
-}
-void Transpose::eval_cpu(const std::vector<array>& inputs, array& out) {
-  eval(inputs, out);
-}
-
-void Arange::eval_cpu(const std::vector<array>& inputs, array& out) {
-  arange(inputs, out, start_, step_);
-}
-
-void AsType::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  auto& in = inputs[0];
-  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
-  copy(in, out, ctype);
-}
-
-void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
-  std::vector<int> sizes;
-  sizes.push_back(0);
-  for (auto& p : inputs) {
-    sizes.push_back(p.shape(axis_));
-  }
-  std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());
-
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-
-  auto strides = out.strides();
-  auto flags = out.flags();
-  flags.row_contiguous = false;
-  flags.col_contiguous = false;
-  flags.contiguous = false;
-  for (int i = 0; i < inputs.size(); i++) {
-    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
-    size_t data_offset = strides[axis_] * sizes[i];
-    out_slice.copy_shared_buffer(
-        out, strides, flags, out_slice.size(), data_offset);
-    copy_inplace(inputs[i], out_slice, CopyType::GeneralGeneral);
-  }
-}
-
-void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  auto& in = inputs[0];
-  if (in.flags().row_contiguous ||
-      (allow_col_major_ && in.flags().col_contiguous)) {
-    out.copy_shared_buffer(in);
-  } else {
-    copy(in, out, CopyType::General);
-  }
-}
-
-void Flatten::eval_cpu(const std::vector<array>& inputs, array& out) {
-  reshape(inputs[0], out);
-}
-
-void Unflatten::eval_cpu(const std::vector<array>& inputs, array& out) {
-  reshape(inputs[0], out);
-}
-
-void Full::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  auto& in = inputs[0];
-  assert(in.dtype() == out.dtype());
-  CopyType ctype;
-  if (in.data_size() == 1) {
-    ctype = CopyType::Scalar;
-  } else if (in.flags().contiguous) {
-    ctype = CopyType::Vector;
-  } else {
-    ctype = CopyType::General;
-  }
-  copy(in, out, ctype);
-}
-
-void Load::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 0);
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-
-  load(out, offset_, reader_, swap_endianness_);
-}
-
-void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
-  // Inputs must be base input array and scalar val array
-  assert(inputs.size() == 2);
-  auto& in = inputs[0];
-  auto& val = inputs[1];
-
-  // Padding value must be a scalar
-  assert(val.size() == 1);
-
-  // Padding value, input and output must be of the same type
-  assert(val.dtype() == in.dtype() && in.dtype() == out.dtype());
-
-  // Fill output with val
-  copy(val, out, CopyType::Scalar);
-
-  // Find offset for start of input values
-  size_t data_offset = 0;
-  for (int i = 0; i < axes_.size(); i++) {
-    auto ax = axes_[i] < 0 ? out.ndim() + axes_[i] : axes_[i];
-    data_offset += out.strides()[ax] * low_pad_size_[i];
-  }
-
-  // Extract slice from output where input will be pasted
-  array out_slice(in.shape(), out.dtype(), nullptr, {});
-  out_slice.copy_shared_buffer(
-      out, out.strides(), out.flags(), out_slice.size(), data_offset);
-
-  // Copy input values into the slice
-  copy_inplace(in, out_slice, CopyType::GeneralGeneral);
-}
-
-void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  // keys has shape (N1, ..., NK, 2)
-  // out has shape (N1, ..., NK, M1, M2, ...)
-  auto& keys = inputs[0];
-  size_t num_keys = keys.size() / 2;
-
-  size_t elems_per_key = out.size() / num_keys;
-  size_t bytes_per_key = out.itemsize() * elems_per_key;
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-
-  auto kptr = inputs[0].data<uint32_t>();
-  auto cptr = out.data<char>();
-  size_t out_skip = (bytes_per_key + 4 - 1) / 4;
-  auto half_size = out_skip / 2;
-  bool even = out_skip % 2 == 0;
-  for (int i = 0; i < num_keys; ++i, cptr += bytes_per_key) {
-    auto ptr = reinterpret_cast<uint32_t*>(cptr);
-    // Get ith key
-    auto kidx = 2 * i;
-    auto k1_elem = elem_to_loc(kidx, keys.shape(), keys.strides());
-    auto k2_elem = elem_to_loc(kidx + 1, keys.shape(), keys.strides());
-    auto key = std::make_pair(kptr[k1_elem], kptr[k2_elem]);
-
-    std::pair<uintptr_t, uintptr_t> count{0, half_size + !even};
-    for (; count.first + 1 < half_size; count.first++, count.second++) {
-      std::tie(ptr[count.first], ptr[count.second]) =
-          random::threefry2x32_hash(key, count);
-    }
-    if (count.first < half_size) {
-      auto rb = random::threefry2x32_hash(key, count);
-      ptr[count.first++] = rb.first;
-      if (bytes_per_key % 4 > 0) {
-        std::copy(
-            reinterpret_cast<char*>(&rb.second),
-            reinterpret_cast<char*>(&rb.second) + bytes_per_key % 4,
-            cptr + 4 * count.second);
-      } else {
-        ptr[count.second] = rb.second;
-      }
-    }
-    if (!even) {
-      count.second = 0;
-      ptr[half_size] = random::threefry2x32_hash(key, count).first;
-    }
-  }
-}
-
-void Reshape::eval_cpu(const std::vector<array>& inputs, array& out) {
-  reshape(inputs[0], out);
-}
-
-void DynamicSlice::eval_cpu(const std::vector<array>& inputs, array& out) {
-  if (out.size() == 0) {
-    out.set_data(nullptr);
-    return;
-  }
-  auto& in = inputs[0];
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-  auto i_offset = compute_dynamic_offset(inputs[1], in.strides(), axes_);
-  copy_inplace(
-      /* const array& src = */ in,
-      /* array& dst = */ out,
-      /* const Shape& data_shape = */ out.shape(),
-      /* const Strides& i_strides = */ in.strides(),
-      /* const Strides& o_strides = */ out.strides(),
-      /* int64_t i_offset = */ i_offset,
-      /* int64_t o_offset = */ 0,
-      /* CopyType ctype = */ CopyType::GeneralGeneral);
-}
-
-void DynamicSliceUpdate::eval_cpu(
-    const std::vector<array>& inputs,
-    array& out) {
-  if (out.size() == 0) {
-    out.set_data(nullptr);
-    return;
-  }
-
-  auto& in = inputs[0];
-  auto& upd = inputs[1];
-
-  // Copy or move src to dst
-  auto ctype = in.flags().contiguous && in.size() == in.data_size()
-      ? CopyType::Vector
-      : CopyType::General;
-  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype);
-
-  auto o_offset = compute_dynamic_offset(inputs[2], out.strides(), axes_);
-  copy_inplace(
-      /* const array& src = */ upd,
-      /* array& dst = */ out,
-      /* const std::vector<int>& data_shape = */ upd.shape(),
-      /* const std::vector<stride_t>& i_strides = */ upd.strides(),
-      /* const std::vector<stride_t>& o_strides = */ out.strides(),
-      /* int64_t i_offset = */ 0,
-      /* int64_t o_offset = */ o_offset,
-      /* CopyType ctype = */ CopyType::GeneralGeneral);
-}
-
-void SliceUpdate::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 2);
-  if (out.size() == 0) {
-    out.set_data(nullptr);
-    return;
-  }
-
-  auto& in = inputs[0];
-  auto& upd = inputs[1];
-
-  if (upd.size() == 0) {
-    out.copy_shared_buffer(in);
-    return;
-  }
-
-  // Check if materialization is needed
-  auto ctype = in.flags().contiguous && in.size() == in.data_size()
-      ? CopyType::Vector
-      : CopyType::General;
-  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype);
-
-  // Calculate out strides, initial offset and if copy needs to be made
-  auto [data_offset, out_strides] =
-      prepare_slice(out, start_indices_, strides_);
-
-  // Do copy
-  copy_inplace(
-      /* const array& src = */ upd,
-      /* array& dst = */ out,
-      /* const std::vector<int>& data_shape = */ upd.shape(),
-      /* const std::vector<stride_t>& i_strides = */ upd.strides(),
-      /* const std::vector<stride_t>& o_strides = */ out_strides,
-      /* int64_t i_offset = */ 0,
-      /* int64_t o_offset = */ data_offset,
-      /* CopyType ctype = */ CopyType::GeneralGeneral);
-}
-
-void View::eval_cpu(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  auto& in = inputs[0];
-  auto ibytes = size_of(in.dtype());
-  auto obytes = size_of(out.dtype());
-  // Conditions for buffer copying (disjunction):
-  // - type size is the same
-  // - type size is smaller and the last axis is contiguous
-  // - the entire array is row contiguous
-  if (ibytes == obytes || (obytes < ibytes && in.strides().back() == 1) ||
-      in.flags().row_contiguous) {
-    auto strides = in.strides();
-    for (int i = 0; i < static_cast<int>(strides.size()) - 1; ++i) {
-      strides[i] *= ibytes;
-      strides[i] /= obytes;
-    }
-    out.copy_shared_buffer(
-        in, strides, in.flags(), in.data_size() * ibytes / obytes);
-  } else {
-    auto tmp = array(
-        in.shape(), in.dtype() == bool_ ? uint8 : in.dtype(), nullptr, {});
-    tmp.set_data(allocator::malloc_or_wait(tmp.nbytes()));
-    if (in.dtype() == bool_) {
-      auto in_tmp = array(in.shape(), uint8, nullptr, {});
-      in_tmp.copy_shared_buffer(in);
-      copy_inplace(in_tmp, tmp, CopyType::General);
-    } else {
-      copy_inplace(in, tmp, CopyType::General);
-    }
-
-    auto flags = out.flags();
-    flags.contiguous = true;
-    flags.row_contiguous = true;
-    auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
-    flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
-    out.move_shared_buffer(tmp, out.strides(), flags, out.size());
-  }
-}
-
-} // namespace mlx::core
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Angelos Katharopoulos	c02e14c264	Add the 3bit packed qmm_t	2024-12-17 22:16:30 -08:00
Angelos Katharopoulos	d75a509234	Add 3bit packed quants	2024-12-17 10:49:13 -08:00
Angelos Katharopoulos	14420949d2	Fix the optional in gather_qmm python binding	2024-12-16 22:14:19 -08:00
Angelos Katharopoulos	4847199ec6	Add the quantization type option to quantizable layers	2024-12-16 22:11:23 -08:00
Angelos Katharopoulos	fb7be036af	Add packed_affine_qmm_t	2024-12-16 21:49:14 -08:00
Angelos Katharopoulos	410ccdbed5	Change the argument name to quantization_type	2024-12-16 13:32:20 -08:00
Angelos Katharopoulos	f5da489a3c	Add some error reporting	2024-12-16 13:22:05 -08:00
Angelos Katharopoulos	c2e6d58441	Revert the change in packing order	2024-12-16 13:20:01 -08:00
Angelos Katharopoulos	17a1fa2f0b	Improve the benchmark	2024-12-14 23:04:29 -08:00
Angelos Katharopoulos	fd161aa31f	Change order in weight packing	2024-12-14 22:51:41 -08:00
Angelos Katharopoulos	bf6dc54110	Add the 2 bit vectorized reads	2024-12-14 21:19:02 -08:00
Angelos Katharopoulos	d7ed624502	Vectorized reads	2024-12-14 15:36:34 -08:00
Angelos Katharopoulos	05cb54ae3f	Another packing	2024-12-13 23:48:25 -08:00
Angelos Katharopoulos	cb358dbdda	Revert "Attempt different packing" This reverts commit `e4b587819c`.	2024-12-13 23:23:41 -08:00
Angelos Katharopoulos	e4b587819c	Attempt different packing	2024-12-13 18:36:36 -08:00
Angelos Katharopoulos	a06c968f4d	Add a small benchmark	2024-12-13 16:29:11 -08:00
Angelos Katharopoulos	651c510940	Working packed qmv	2024-12-13 16:29:11 -08:00
Angelos Katharopoulos	11ec07ff9d	Initial python binding	2024-12-13 16:29:11 -08:00
Angelos Katharopoulos	bdd68bd893	Add a quantization type in the ops	2024-12-13 16:29:11 -08:00