Compare commits


186 Commits

Author SHA1 Message Date
Angelos Katharopoulos
221f8d3fc2 Bump the version to 0.2 (#656) 2024-02-08 11:27:12 -08:00
Awni Hannun
5c03efaf29 Compile docs (#653)
* compile docs

* docs nits + comments
2024-02-08 11:21:50 -08:00
LeonEricsson
7dccd42133 updated calls to use loc & scale (#643) 2024-02-08 09:01:59 -08:00
Awni Hannun
1b97b2958b Compile with capture (#629)
* Simple kernel generation

* Remove the generate kernel from graph_utils

* fix multi-output with compile

* fuse with stopgrad

* v1 input, output capture in compile

* cleanup tree update with visitor update

* nit

* remove todo

* state for model, optional explicit init and more pure optimizer steps

* move learning rate to state

* add lr to opt state, some fixes in capture

* fix optim

* update tuple of containers as well

* fix stream for compiled output

* rng state for compile

* nit

* updates and comments

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-02-07 17:29:22 -08:00
Awni Hannun
e5e816a5ef fix sequential with empty modules at end (#647) 2024-02-07 13:22:27 -08:00
Angelos Katharopoulos
28eac18571 Kernel generation (#614)
Generate reusable element-wise kernels given a computation graph.
2024-02-07 13:15:59 -08:00
Noah Farr
5fd11c347d Add loc and scale to random.normal (#638)
* Add loc and scale to random.normal

* Add tests for loc and scale for random.normal

* Run pre-commit hooks

* Fix code review
2024-02-07 11:49:59 -08:00
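A minimal usage sketch (not part of the commit) of the `loc`/`scale` parameters added above; the values are illustrative:

```python
import mlx.core as mx

# Sample from N(loc=2.0, scale=0.5) rather than the standard normal.
x = mx.random.normal(shape=(1000,), loc=2.0, scale=0.5)
print(mx.mean(x), mx.var(x) ** 0.5)  # roughly 2.0 and 0.5
```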
Aryan Gupta
ef73393a19 Feat: Add weights argument in BCE Loss and tests (#620) 2024-02-07 09:39:52 -08:00
Angelos Katharopoulos
ea406d5e33 CI change (#645)
* CI update

* Skip large binary test for now

* Upgrade pip

* Add proper env variable skipping

* Update the CI

* Fix workflow name

* Set the low memory flag for the tests

* Change build process

* Add pip upgrade

* Use a venv

* Add a missing env activate

* Add setuptools

* Add twine upload back

* Re-enable automatic release builds
2024-02-07 06:04:34 -08:00
Awni Hannun
146bd69470 Skip compile when transforming (#635)
* skip compile when transforming

* simplify message
2024-02-05 21:28:37 -08:00
Jagrit Digani
316ff490b3 Remove masks from BlockLoader and clear out load case for invalid thread (#634) 2024-02-05 16:00:17 -08:00
Awni Hannun
d40a04f8dc minor fixes (#631)
* minor fixes

* var with ddof >= nelements
2024-02-05 13:27:49 -08:00
Awni Hannun
d75ae52ecd Compile primitive (#571)
* Compiled primitive with basic binary, unary graph-level fusion
2024-02-05 06:51:22 -08:00
Avikant Srivastava
31fea3758e feat: enhancement of the error message for mlx.core.mean (#608)
* add error message
2024-02-05 01:21:49 -08:00
Awni Hannun
e319383ef9 Faster gather (#626)
* faster gather

* update copyright
2024-02-04 17:25:44 -08:00
Awni Hannun
5c3ac52dd7 fix test (#627) 2024-02-04 16:18:03 -08:00
David Koski
ebfd3618b0 fixes for building and running on iOS (#619)
* fixes for building and running on iOS

* per suggestion just use Accelerate
2024-02-04 12:29:17 -08:00
Avikant Srivastava
11a9fd40f0 fix: handle linspace function when num is 1 (#602)
* fix: handle linspace function when num is 1

* add comment

* fix test case

* remove breakpoint
2024-02-04 11:03:49 -08:00
Daniel Strobusch
4fd2fb84a6 make python array SupportsAbs conform (like numpy) (#624) 2024-02-04 09:31:02 -08:00
Daniel Strobusch
9852af1a19 fix "shape" docstring. (#623) 2024-02-04 09:21:22 -08:00
minghuaw
16750f3c51 Fix typo in CMakeLists.txt (#616) 2024-02-03 05:59:26 -08:00
Awni Hannun
95b5fb8245 minor changes (#613) 2024-02-02 11:48:35 -08:00
AtomicVar
83f63f2184 Add Margin Ranking Loss (#536) 2024-02-02 10:57:31 -08:00
Awni Hannun
cb6156d35d Fix eval in trace bugs (#612)
* Fix eval in trace bugs

* comment nit
2024-02-02 09:57:12 -08:00
Piotr Rybiec
506d43035c typo fix (#607) 2024-02-01 17:39:55 -08:00
Angelos Katharopoulos
36cff34701 Bump the version (#604) 2024-02-01 11:41:38 -08:00
Awni Hannun
e88e474fd1 Reduce vmap + some fixes (#601) 2024-02-01 11:30:28 -08:00
David Koski
601c6d6aa8 Fix for AdaDelta (#603)
- state was being read from parameter "s"
- but being stored in parameter "u"
2024-02-01 09:56:27 -08:00
Angelos Katharopoulos
ba8d6bf365 Change the transformer to norm_first by default (#599) 2024-01-31 12:55:30 -08:00
Sugato Ray
4a5f3b21bb Add py.typed to support PEP-561 (type-hinting) for mlx (#588)
* Add `py.typed` to support PEP-561 (type-hinting)

This adds support for type-hinting information as laid in [PEP-561](https://peps.python.org/pep-0561/).

* add py.typed to MANIFEST.in
2024-01-31 12:05:42 -08:00
Vijay Krish
fcc5ac1c64 Add GPU support for uint64/int64 reductions (#569) 2024-01-31 11:18:04 -08:00
nathan
bad67fec37 Added TeX line breaks to mlx.optimizers.Lion docstring (#595)
Fixes the "misplaced &" MathJax error in documentation.
2024-01-30 19:37:34 -08:00
Angelos Katharopoulos
199aebcf77 Change the variance computation (#319) 2024-01-30 19:28:56 -08:00
Angelos Katharopoulos
0de5988f92 Custom VJP and checkpointing (#541)
* Implement custom_vjp and checkpointing
* Add a dependency management primitive
* Change the eval order to deep branches first
* Add graph depth tracking to the array
2024-01-30 16:04:45 -08:00
Jacket
143e2690d5 Fix SGD implementation (#473) 2024-01-30 15:50:46 -08:00
Jagrit Digani
375446453e Update Compute Pipeline Creation API (#581)
* Add option to specialize metal functions on function constants
* Update Compute Pipeline Creation API
* Add options to make libraries from source and stitching
* Update function specialization name options
2024-01-30 15:42:36 -08:00
Angelos Katharopoulos
1895d34c20 Fix log1p with inf inputs (#592) 2024-01-30 14:02:50 -08:00
Awni Hannun
09b9275027 Make shape a tuple (#591)
* shape tuple

* also remove simplify from docs

* rebase
2024-01-30 13:11:01 -08:00
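A small sketch of the behavior change named in the title above (`shape` now returns a Python tuple):

```python
import mlx.core as mx

x = mx.zeros((2, 3))
print(x.shape)        # (2, 3) -- a tuple, so it can be unpacked or hashed
rows, cols = x.shape
```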
Andre Slavescu
d3a9005454 Softshrink mapping + op (#552)
* Added Softshrink mapping + op

* formatting

* docs + nits in docstring

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-30 12:56:28 -08:00
Jacket
3f7aba8498 Implement diagonal operator (#562)
* Implement diagonal operator

This implements mx.diagonal in operator level, inspired by
@ManishAradwad.

* added `mx.diag` with tests

* corrected few things

* nits in bindings

* updates to diag

---------

Co-authored-by: ManishAradwad <manisharadwad@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-30 09:45:48 -08:00
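A short usage sketch of the two ops added in the commit above (shapes chosen for illustration):

```python
import mlx.core as mx

m = mx.arange(9).reshape(3, 3)
print(mx.diagonal(m))                # main diagonal: [0, 4, 8]
print(mx.diag(mx.array([1, 2, 3])))  # builds a 3x3 matrix with [1, 2, 3] on the diagonal
```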
Angelos Katharopoulos
65d0b8df9f Fix binary op dispatch (#584) 2024-01-29 19:36:17 -08:00
Awni Hannun
3c2f192345 Propagate nans in binary ops (#579)
* propagate nans in binary ops

* handle empty matmul

* cpu minimum/maximum propagate nan

* benchmark maximum

* add min as well

* throw on negative indices with full

* verbose on linux

* fix matmul for zero K
2024-01-29 11:19:38 -08:00
Angelos Katharopoulos
37d98ba6ff No gil eval (#565) 2024-01-26 22:03:52 -08:00
Awni Hannun
8993382aaa Buffer Donation (#519)
* buffer donation

* fix to move shared pointer

* format

* gpu in place for copy and binary

* revert ops test

* cpu in place

* a little cleanup

* remove useless bench
2024-01-26 16:30:33 -08:00
Awni Hannun
07f35c9d8a Fix a few issues: docs for flatten, erf, dequantize validation (#560)
* doc flatten

* erf doc

* check values for dequantize

* format
2024-01-26 15:16:46 -08:00
Jagrit Digani
bf17ab5002 Add more checks and clearer error messages to conv operations (#563)
* Add more checks and clearer error messages to conv operations
2024-01-26 15:13:26 -08:00
Awni Hannun
8fa6b322b9 Compile front-end (#476)
* fix tests for linux

* make a move on compile

* basic compile scaffold works

* compile binding

* clean

* fix

* fix grad, more tests

* basic python tests

* fix segfault on python exit

* compile works with python closures

* fix test

* fix python globals bug, and erase

* simplify

* more cpp tests

* bug fix with move function and compile at exit

* simplify inputs also

* enable and disable compiler

* remove simplify

* simplify tests use compile now

* fix multi-output with compile

* clear output tree from cache when function goes out of scope

* ../python/src/transforms.cpp

* remove closure capture

* comments
2024-01-26 13:45:30 -08:00
David Koski
874b739f3c Fix cache key in RoPE (#561) 2024-01-26 13:10:02 -08:00
taher
077c1ee64a QR factorization (#310)
* add qr factorization

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-26 09:27:31 -08:00
Rifur13
2463496471 [Fix] mx.allclose bug with infinite values (#539)
* Added isclose op and fixed comparison with inf values

* Added 'equal_nan' to match numpy

* format

* Add test

* Update python/src/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Addressed CR comments

* Update python/src/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* nits

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-25 20:47:06 -08:00
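A brief sketch of the behavior this fix targets (inf handling, plus the `equal_nan` flag added to match numpy):

```python
import mlx.core as mx

a = mx.array([float("inf"), float("nan")])
b = mx.array([float("inf"), float("nan")])
print(mx.allclose(a, b))                  # False: NaN != NaN by default
print(mx.allclose(a, b, equal_nan=True))  # True: infs match and NaNs compare equal
```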
Angelos Katharopoulos
87b7fa9ba2 Bump the version (#554) 2024-01-25 11:01:05 -08:00
Danilo Peixoto
624065c074 Fix package installation for CI (#521)
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-01-25 09:43:34 -08:00
Awni Hannun
f27ec5e097 More helpful error message in vjp transform + concatenate bug (#543)
* more helpful message in vjp transform

* fix concatenate on mismatch dims

* typo

* typo
2024-01-24 09:58:33 -08:00
Awni Hannun
f30e63353a Minor updates to address a few issues (#537)
* docs on arg indices return type

* arange with nan

* undo isort
2024-01-23 22:24:41 -08:00
Juarez Bochi
4fe2fa2a64 GGUF: Avoid dequantization when format is compatible (#426)
* GGUF: Don't dequantize q4_1

* Fix weight order. First in low bits

* Add unpacking for q4_0

* Don't dequantize q8_0

* rebase quants and split file

* don't quantize every weight

* reapply patch

* error handling

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 15:43:57 -08:00
Hazem Essam
37fc9db82c Added Adafactor (#415)
* Added adafactor

* Added Adafactor and ran pre-commit

* modified operations

* Added docstrings

* Switched two ops to fix a bug

* added underscore for internal functions and removed the plus sign in the last return statement

* Removed parameter rms from the optimizer state because it's not needed

* Added simple MNIST test for Adafactor and temporary training log

* remove test files

* nits in docs

* comment nit

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 15:11:27 -08:00
AtomicVar
755dcf6137 Enable cross_entropy loss to handle dense targets (#517)
* Enable cross_entropy loss to handle dense targets

Dense targets means probabilities or one-hot encodings.

* better shape check of weights

* nits in docstring

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 12:17:22 -08:00
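A minimal sketch of the dense-target form described in the commit above (probabilities rather than class indices; the values are illustrative):

```python
import mlx.core as mx
import mlx.nn as nn

logits = mx.array([[2.0, -1.0, 0.5]])
targets = mx.array([[0.7, 0.1, 0.2]])  # class probabilities, not an index
loss = nn.losses.cross_entropy(logits, targets)
```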
LeonEricsson
6b4b30e3fc Common neural network initializers nn.initializers (#456)
* initial commit: constant, normal, uniform

* identity, glorot and he initializers

* docstrings

* rm file

* nits

* nits

* nits

* testing suite

* docs

* nits in docs

* more docs

* remove unused template

* rename package to nn.init

* docs, receptive field

* more docs

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 06:47:20 -08:00
Awni Hannun
86e0c79467 remove stale benchmarks (#527) 2024-01-22 22:17:58 -08:00
Awni Hannun
98c37d3a22 use axes in tensordot (#525) 2024-01-22 21:17:00 -08:00
Sugato Ray
f326dd8334 Update README.md (#524)
Add conda install option in docs.
2024-01-22 20:53:54 -08:00
Jagrit Digani
6d3bee3364 Fix oob reads in gemv kernel (#523) 2024-01-22 12:06:04 -08:00
Danilo Peixoto
ecb174ca9d Type annotations for mlx.core module (#512) 2024-01-21 12:53:12 -08:00
Awni Hannun
7a34e46677 Quantize with groups of 32 (#511)
* allow quantize with group sizes of 32

* missing cpu dispatch

* remove print

* Fix qvm for group_size 32

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-21 06:19:05 -08:00
Nripesh Niketan
92c22c1ea3 feat: Update isort version to 5.13.2 (#514) 2024-01-21 06:11:48 -08:00
Awni Hannun
d52383367a format (#510) 2024-01-20 10:33:46 -08:00
Arda Orçun
363d3add6d Add ValueError message for Adamax (#508)
* ValueError message added

* beta errors added

* some corrections and testing

* Learning rate limitation deleted
2024-01-20 07:56:15 -08:00
Awni Hannun
b207c2c86b Power VJP fix for 0 (#505) 2024-01-20 01:17:40 -08:00
Awni Hannun
6bf779e72b fix array from list for > 32 bit types (#501) 2024-01-19 15:49:25 -08:00
Juarez Bochi
ddf50113c5 GGUF: Load and save metadata (#446)
* gguf metadata
---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-19 14:06:05 -08:00
Arda Orçun
6589c869d6 Added MSE message (#500)
* Added MSE message

* changed wrong line.

* Update examples/python/linear_regression.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-01-19 06:27:50 -08:00
Anchen
f6feb61f92 feat: add support for saving safetensors in the save_weights (#497)
* feat: add save safetensors support in module save_weights

* chore: checking missing changes

* Update python/mlx/nn/layers/base.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* chore: update docstring for load_weights

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-01-19 06:19:33 -08:00
Awni Hannun
c4ec836523 fix isinf for integer types (#494) 2024-01-19 05:31:10 -08:00
AtomicVar
550d4bf7c0 Update binary_cross_entropy function to handle both logits and probabilities (#492) 2024-01-18 19:22:23 -08:00
Awni Hannun
f6e911ced0 version bump (#490)
* version bump

* Fix the dev version string

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-18 12:00:24 -08:00
Awni Hannun
3d99a8d31d Fix format / build (#489) 2024-01-18 10:01:59 -08:00
Ethan
a749a91c75 Support disable metal buffer cache to prevent performance degradation caused by large memory caching (#390)
* support disabling the Metal buffer cache, since large amounts of unused memory stay buffered when an LLM generates long-context tokens

* Run format and add "cache_enabled" feature tests
2024-01-18 08:33:34 -08:00
toji
49a52610b7 Added formatter structure and a boolean value formatter (#354)
* added formatter structure and a boolean value formatter

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-18 07:49:41 -08:00
AtomicVar
d1fef34138 Add Gaussian NLL loss function (#477)
* Add Gaussian NLL loss function

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-18 06:44:44 -08:00
Angelos Katharopoulos
9c111f176d Fix split optimization for array iterator (#484) 2024-01-18 05:50:25 -08:00
Awni Hannun
78e5f2d17d usage doc for function transformations (#481) 2024-01-17 17:10:53 -08:00
Angelos Katharopoulos
90c234b7ac Fix round to round half-cases to even (#482) 2024-01-17 15:27:23 -08:00
Angelos Katharopoulos
135fd796d2 Fix detach for multi-output primitives (#480) 2024-01-17 14:08:07 -08:00
Jagrit Digani
78102a47ad Update GEMM (#424)
* Organize and collect metal subroutine templates and elements in `metal/kernels/steel/`
* Update gemm elements for better performance 
* Add split-K specialization for gemm
* Add `addmm` primitive, op and bindings for fused matmul and bias addition 
* Update tests and benchmarks as needed
2024-01-17 12:42:39 -08:00
Diogo
556cdf0e06 Resolves build issues with the extension example (#419)
* resolved extension build issues and added test to ci

* missing gguflib

* rebased

* force mlx install from fix branch

* linux build issue

* point to git install and comment out ci tests
2024-01-17 12:07:05 -08:00
Awni Hannun
275db7221a Command buffer reports errors (#479)
* command buffer reports errors

* typo

* simplify
2024-01-17 11:53:30 -08:00
AtomicVar
4a9012cba0 Sort some APIs docs by names (a-z) (#472) 2024-01-16 19:37:50 -08:00
Awni Hannun
a2bf7693dd Primitive's VJP takes outputs as input (#475)
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-16 19:03:53 -08:00
Angelos Katharopoulos
d8fabaa12b Split multi output (#461)
* Multi-output split primitive
* Add the multi-output split to the ArrayIterator
* Add some grad tests for split
2024-01-16 13:33:55 -08:00
Avikant Srivastava
4e290d282f feat: add time based seed to random.h (#457)
* random seed from time

* fix: chrono

* refactor: snake case
2024-01-16 07:32:28 -08:00
Yashraj Singh
e72458a3fa implemented isposinf and isneginf in one PR (#470)
* ran precommit

* updated docs
2024-01-16 06:48:07 -08:00
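A quick sketch of the two new predicates from the commit above:

```python
import mlx.core as mx

x = mx.array([1.0, float("inf"), float("-inf")])
print(mx.isposinf(x))  # [False, True, False]
print(mx.isneginf(x))  # [False, False, True]
```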
Awni Hannun
a2ffea683a Fix eye for larger matrices (#463)
* fix eye
* fix scatter for <32bit (non native atomic) types
* fix int overflow
2024-01-16 00:51:24 -08:00
Angelos Katharopoulos
c15fe3e61b Allow arbitrary first dimension in quantization kernels. (#458)
* Allow arbitrary first dim on qmm_t and qmv
* Allow arbitrary first dim on qmm and qvm
* Specialized aligned vs unaligned case
* Add more checks for valid quantizations
2024-01-16 00:46:21 -08:00
Tristan Bilot
f44c132f4a Add scatter_min VJP (#462) 2024-01-16 00:37:40 -08:00
Matthew Ernst
92a2fdd577 Adds isinf (#445)
* adds isinf

Signed-off-by: matthewfernst <matthew.f.ernst@gmail.com>

* use stream + nits

* typo

---------

Signed-off-by: matthewfernst <matthew.f.ernst@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-15 19:50:44 -08:00
Tristan Bilot
6022d4129e scatter_max vjp + bindings + tests (#431)
Co-authored-by: DjamelMesbah <djamel.mesbah@adservio.fr>
2024-01-14 14:12:15 -08:00
Awni Hannun
4bc446be08 Use a dummy primitive to only sync with one output (#453)
* Use a dummy primitive to only sync with one output
* Fix test and choose stream with slight care
2024-01-14 14:09:40 -08:00
Awni Hannun
41cc7bdfdb Fix stub generation, change graph exporting for arrows to go to outputs (#455) 2024-01-14 14:06:16 -08:00
Awni Hannun
6e81c3e164 Sync only with outputs we need to sync with (#447) 2024-01-13 01:47:25 -08:00
Diogo
2e29d0815b Add tile op (#438) 2024-01-12 23:03:16 -08:00
Awni Hannun
1b71487e1f docs (#444) 2024-01-12 13:34:16 -08:00
Ayush Shridhar
1416e7b664 Add isnan (#423) 2024-01-12 11:16:48 -08:00
davidkoski
29081204d1 array.swapaxes should point to swapaxes free function (#441) 2024-01-12 11:06:16 -08:00
Angelos Katharopoulos
006d01ba42 Fix packaging of gguflib (#435) 2024-01-11 13:56:03 -08:00
Awni Hannun
46dc24d835 version bump (#433) 2024-01-11 12:29:35 -08:00
Awni Hannun
c9934fe8a4 Metal validation (#432)
* tests clear metal validation

* add cpp test with metal validation to circleci

* nit
2024-01-11 11:57:24 -08:00
Avikant Srivastava
975e265f74 feat: Add numpy constants (#428)
* add numpy constants

* feat: add unittests

* add newaxis

* add test for newaxis transformation

* refactor
2024-01-11 06:47:29 -08:00
Awni Hannun
c92a134b0d more docs (#421)
* more docs

* fix link

* nits + comments
2024-01-10 14:04:12 -08:00
Awni Hannun
3b4f066dac Correct types for vjp + tests (#418)
* correct types for vjp + tests

* fix build + comment
2024-01-10 13:32:37 -08:00
Juarez Bochi
b7f905787e GGUF support (#350)
* Initial GGUF support for tensor fields.

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-10 13:22:48 -08:00
Chunyang Wen
e3e933c6bc Add type hint for Module (#412) 2024-01-10 11:23:42 -08:00
Awni Hannun
1d90a76d63 in place ops behave in place, fix some overloads (#411) 2024-01-09 16:05:38 -08:00
Angelos Katharopoulos
961435a243 Scatter vjp (#394)
* Add a first scatter vjp
* Implement the scatter_add vjp
* Add array.at to implement user friendly scatters
2024-01-09 13:36:51 -08:00
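A minimal sketch of the user-facing pattern that `array.at` enables per the commit above (duplicate indices accumulate instead of overwriting; values are illustrative):

```python
import mlx.core as mx

a = mx.zeros((5,))
b = a.at[mx.array([0, 0, 3])].add(1.0)  # scatter-add; returns a new array
print(b)                                # [2, 0, 0, 1, 0]
```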
Awni Hannun
e9ca65c939 Fix BN stats to not expand shape (#409)
* fix BN stats to not expand shape

* nit
2024-01-09 11:54:51 -08:00
Dwayne Robinson
753867123d Fix data_types.rst uint64 (#406)
uint64 correctly says 8 bytes, but the description is copy pasta.
2024-01-09 06:40:10 -08:00
Awni Hannun
f099ebe535 Multi output primitives (#330)
* Multi-output primitives

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-08 16:39:08 -08:00
BigsnarfDude
f45f70f133 Update mlx-example link for llms llama in llama-inference.rst (#405) 2024-01-08 16:29:53 -08:00
YUN, Junwoo
0b8aeddac6 Additional losses (#336)
* cosine similarity loss

---------

Co-authored-by: Awni Hannun <awni@apple.com>

* Docstring nits
2024-01-08 14:01:13 -08:00
Jagrit Digani
432ee5650b Update cpp tests with allclose and doctest::Approx for numerical tolerance (#401) 2024-01-08 09:35:05 -08:00
Nripesh Niketan
73321b8097 feat: add logicalAnd and logicalOR (#386)
* feat: add logicalAnd and logicalOR

* run pre-commit

* Refactor logical_and and logical_or functions

* Add acknowledgement

* Add logical AND and logical OR operators

* Refactor logical_and and logical_or functions

* Add support for logical operators on bool arrays

* Update mlx/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update mlx/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Add logical AND and OR operators for arrays and scalars

* Refactor vjp and jvp methods in primitives.cpp

* Add overloaded operators for logical AND and OR

* format

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-08 07:00:05 -08:00
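A short usage sketch of the two ops added in the commit above:

```python
import mlx.core as mx

a = mx.array([True, True, False])
b = mx.array([True, False, False])
print(mx.logical_and(a, b))  # [True, False, False]
print(mx.logical_or(a, b))   # [True, True, False]
```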
Hazem Essam
022a944367 Added GLU activation function and Gated activation function (#329)
* Added GLU activation function and gated activation function

* Ran pre-commit

* Ran pre commit

* Removed old sigmoid implementation to match with main

* Removed gated activation from __init__.py

* Removed unused test cases

* Removed unused imports

* format / docstring

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-08 06:13:16 -08:00
Chris Costes
026ef9aae4 Update Install Instructions (#397)
* Add note to install instructions for building from source to ensure native arm64 environment and tools.

* Add troubleshooting info.

* remove cmake bits

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-07 19:11:04 -08:00
Angelos Katharopoulos
a611b0bc82 Removes the retain_graph flag (#385)
* Adds global tracing flag
* Removes retain_graph in favor of is_tracer
2024-01-07 15:16:51 -08:00
Diogo
449b43762e Add inner / outer op (#348)
* inner / outer impl

* python tests

* ops list and ack

* updated descriptions

* use test helper

* removed dtype check and flatten outer to 1-D

* updated docs

* just use the reshape to flatten
2024-01-07 09:01:09 -08:00
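A brief sketch of the two ops added above (per the notes in that commit, `outer` flattens its inputs to 1-D):

```python
import mlx.core as mx

a = mx.array([1.0, 2.0, 3.0])
b = mx.array([4.0, 5.0])
print(mx.inner(a, a))  # 14.0: sum of products over the last axes
print(mx.outer(a, b))  # 3x2 outer product
```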
Angelos Katharopoulos
6ea6b4258d Fix style check (#395) 2024-01-07 05:54:58 -08:00
Anchen
48f6ca8c3a Add theta cache for Rope and mask cache for ALiBi (#375) 2024-01-07 00:22:58 -08:00
Awni Hannun
c6d2878c1a safely divide for 0 size inputs (#388) 2024-01-07 00:19:54 -08:00
Awni Hannun
b34bf5d52b fix saving for non-contiguous arrays (#389) 2024-01-06 12:44:02 -08:00
Angelos Katharopoulos
608bd43604 Move the matmul type check in the op (#384) 2024-01-05 19:10:13 -08:00
Angelos Katharopoulos
4c48f6460d Fix segfault from buffer protocol and tests (#383)
* Fix segfault from buffer protocol and tests

* Fix tf test
2024-01-05 18:17:44 -08:00
Daniel Strobusch
1331fa19f6 Make array conform to the Python Buffer Protocol (#323) 2024-01-05 15:58:33 -08:00
Daniel Strobusch
dfdb284e16 make behaviour of dtype arguments consistent and compliant to numpy (#379)
All functions that take an optional dtype should

* have a default dtype visible in the generated docs (accomplished via `"dtype"_a = std::optional{float32}`)
* behave identical when `dtype=None` or no dtype is passed

This is important when passing keyword args down from a numpy-style function, for example:

```
def f(x, dtype=None):
  mx.random.uniform(dtype=dtype)
  # ...
```

NumPy functions behave like this.

It also fixes a minor bug in `tri`: #378

Closes #378
2024-01-05 09:37:46 -08:00
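A minimal check (not from the commit) of the convention described above: passing `dtype=None` should match omitting `dtype` entirely.

```python
import mlx.core as mx

a = mx.random.uniform(shape=(2, 2))              # default dtype, float32
b = mx.random.uniform(shape=(2, 2), dtype=None)  # must behave identically
assert a.dtype == b.dtype == mx.float32
```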
mutexuan
d8f41a5c0f support python mlx.array creation from list of mlx.array's (#325)
* support python mlx.array creation from list of mlx.array's

* include bfloat16 in UT

* refactor so that sub array made of all python primitive types gets initialized by fill_vector

* address PR comment: arr.shape().size() -> arr.ndim()

* address PR comment: get back Dtype constness and let stack to handle type promotions automatically
2024-01-04 18:53:33 -08:00
Awni Hannun
b9e415d19c bump pre commit and fix format (#373) 2024-01-04 16:28:52 -08:00
davidkoski
c82a8cc526 move all ObjC (via metal-cpp) interaction until post static initializers (#370)
* move all ObjC (via metal-cpp) interaction until post static initializers

- metal-cpp relies on static initializers to cache class and selector pointers
- code in mlx was using metal-cpp to set up NSAutoreleasePools during its own static init time
- but this code was silently failing as the class and selector pointers from metal-cpp were still nil

- defer the creation of NSAutoreleasePools until after static init time
- ensure that we have coverage where autorelease pools are needed

* Update device.cpp

remove commented code

* Update device.cpp

remove commented out code

* Update scheduler.h

update comment

* per discussion use the pool inside the task() -- this will be metal only, not needed for cpu

* Update allocator.cpp

move pool to release/alloc area
2024-01-04 16:12:00 -08:00
Angelos Katharopoulos
75dc537e44 Fix the sigmoid module (#371) 2024-01-04 13:16:36 -08:00
Awni Hannun
cf88db44b5 revert copy (#366) 2024-01-04 10:43:29 -08:00
Chunyang Wen
16856a0160 Remove useless pass (#364)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2024-01-04 06:34:01 -08:00
Awni Hannun
d752f8e142 Fix CI (#359)
* fix ci

* check for linux for fp16
2024-01-04 06:33:08 -08:00
toji
d2467c320d Added support for python copy (#335)
* Added support for python copy

* precommit changes

* removed `_compiled_call_impl` line

* added tests and suggested changes

* ACK changes
2024-01-03 20:59:40 -08:00
Diogo
0d31128a44 use union instead of | (#358) 2024-01-03 19:33:19 -08:00
Diogo
1ac18eac20 simple numpy helper for tests (#352) 2024-01-03 19:19:19 -08:00
Awni Hannun
526466dd09 version bump (#355)
* version bump

* one more
2024-01-03 14:48:24 -08:00
Angelos Katharopoulos
e7f5059fe4 Support for quantized matmul with w and w^T (#349)
* Add the metal qvm implementation
* Add qmm_n
* Add gradient wrt to input for quantized_matmul
2024-01-03 14:22:36 -08:00
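A sketch of the quantized matmul pattern this commit extends, assuming the usual `quantize` → `quantized_matmul` pairing (the keyword names match the benchmark diff further down; group size and bit width are chosen for illustration):

```python
import mlx.core as mx

x = mx.random.normal((2, 512))
w = mx.random.normal((512, 512))  # (out_features, in_features)
wq, scales, biases = mx.quantize(w, group_size=64, bits=4)
# transpose=True multiplies by w^T (the common case); transpose=False uses w directly.
y = mx.quantized_matmul(x, wq, scales, biases, transpose=True, group_size=64, bits=4)
```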
Nripesh Niketan
d7ac050f4b feat: Add contributors graph to README (#332)
* Fix: typo in README.md

* feat: Add contributors graph to README

* Update acknowledgments and contributors
2024-01-03 13:03:11 -08:00
Gabrijel Boduljak
c7edafb729 implemented InstanceNorm (#244)
* implemented instancenorm

* implemented vector_norm in cpp

added linalg to mlx

* implemented vector_norm python binding

* renamed vector_norm to norm, implemented norm without provided ord

* completed the implementation of the norm

* added tests

* removed unused import in linalg.cpp

* updated python bindings

* added some tests for python bindings

* handling inf, -inf as numpy does, more extensive tests of compatibility with numpy

* added better docs and examples

* refactored mlx.linalg.norm bindings

* reused existing util for implementation of linalg.norm

* more tests

* fixed a bug with no ord and axis provided

* removed unused imports

* some style and API consistency updates to linalg norm

* remove unused includes

* fix python tests

* fixed a bug with frobenius norm of a complex-valued matrix

* complex for vector too

* addressed PR review comments

* fixed import order in __init__

* expected values in instancenorm tests are simple lists

* minor return expression style change

* added InstanceNorm to docs

* doc string nits

* added myself to individual contributors

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-03 12:21:15 -08:00
Awni Hannun
dff4a3833f Module checks the weight on load_weights (#337)
* update module to check weights on load, also fix docs and reorganize tests

* nits + rebase

* a few more docs updates for Module

* use manual module file

* comment
2024-01-02 18:55:42 -08:00
Diogo
0782a4573a Add Tensordot op (#344) 2024-01-02 17:15:00 -08:00
Diogo
af66a09bde Adds issue template with common questions (#345)
* added template

* remove label
2024-01-02 16:52:20 -08:00
Angelos Katharopoulos
436bec9fd9 Fix the implementation of the Bilinear layer (#347) 2024-01-02 16:46:18 -08:00
Awni Hannun
99c80a2c8b Memory allocation (#292)
* try alternative gc

* try no cache

* add forced swap

* remove cache for now

* add cache back

* change fit criteria

* remove unused function

* nit in comment

* tune / fix allocation

* increase block limit to original
2024-01-02 11:59:19 -08:00
Asaf Zorea
295ce9db09 Feature expand nn linear (#315)
* Added an identity and bilinear layers
Added a reset_parameters option
Added normal init for bias

* pre-commit run

* add type hints for parameters and the return type
change Bilinear math to x_1 and x_2
change __call__ arguments to x and y instead of input and output
add explanation to the Initialization

* Remove unnecessary reshape

* Added 'i' to bilinear formula

* Changed bilinear computation to two matrix multiplications

* avoid saving intermediate results, kept y in bilinear for better clarity (can be replaced with x1)

* Changed math formula in Linear
Added more explanation to math formulas
Changed x1, x2 reshape to support all inputs sizes
2024-01-02 06:08:53 -08:00
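A minimal usage sketch of the Bilinear layer described in the commit above (the constructor argument order — x1 features, x2 features, output features — is assumed, not taken from the diff):

```python
import mlx.core as mx
import mlx.nn as nn

layer = nn.Bilinear(8, 16, 4)
x1 = mx.random.normal((2, 8))
x2 = mx.random.normal((2, 16))
y = layer(x1, x2)  # shape (2, 4)
```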
Josh Soref
44c1ce5e6a Spelling (#342)
* spelling: accumulates

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: across

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: additional

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: against

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: among

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: array

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: at least

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: available

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: axes

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: basically

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: bfloat

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: bounds

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: broadcast

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: buffer

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: class

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: coefficients

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: collision

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: combinations

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: committing

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: computation

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: consider

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: constructing

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: conversions

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: correctly

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: corresponding

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: declaration

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: default

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: dependency

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: destination

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: destructor

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: dimensions

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: divided

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: element-wise

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: elements

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: endianness

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: equivalent

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: explicitly

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: github

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: indices

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: irregularly

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: memory

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: metallib

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: negative

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: notable

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: optional

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: otherwise

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: overridden

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: partially

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: partition

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: perform

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: perturbations

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: positively

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: primitive

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: repeat

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: repeats

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: respect

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: respectively

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: result

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: rounding

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: separate

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: skipping

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: structure

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: the

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: transpose

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: unnecessary

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: unneeded

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: unsupported

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

---------

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>
2024-01-01 21:08:17 -08:00
Chunyang Wen
144ecff849 Remove useless import (#340)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2024-01-01 19:25:49 -08:00
mutexuan
350095ce6e fix type cast error in item() for bfloat16 (#339)
Co-authored-by: xuan <xuan@apple.com>
2024-01-01 19:02:04 -08:00
Nripesh Niketan
e09bf35b28 feat: Add Dropout3d layer to nn.layers (#313)
* feat: Add Dropout3d layer to nn.layers

* acknowledgement

* Add dropout tests to test_nn.py

* run pre-commit

* Add activation functions and dropout3d ops

* Add dropout tests for bfloat16 and float16
2023-12-31 14:01:21 -08:00
Daniel Strobusch
99c20f523e fix typos (#327) 2023-12-31 06:06:47 -08:00
Hazem Essam
e3b8da2a49 Added implementation for Scaled RoPE. (#261)
* Added scale for RoPE

* Ran pre-commit

* Added RoPE scaling test

* Added docstring for scale parameter

* Modified docstrings
2023-12-31 06:06:01 -08:00
Angelos Katharopoulos
a020a2d49d Improve repeat using broadcasting and reshape (#318) 2023-12-29 21:40:20 -08:00
Nripesh Niketan
930b159885 Fix: typo in README.md (#316) 2023-12-29 12:58:00 -08:00
Nripesh Niketan
5ad8fb7268 feat: add softsign, softmax, hardswish, logsoftmax activation function (#309)
* feat: add softsign activation function

* run pre-commit

* Add Softsign activation function

* Add Softsign activation function

* Add documentation for ReLU6, Softplus, and Softsign activations

* Update activation functions in neural network layers

* Add LogSoftmax and Hardswish activations

* run pre-commit

* Update activations.py

* Added acknowledgements

* Fix activation function comments

* Fix activation functions in neural network layers
2023-12-29 11:49:36 -08:00
Chunyang Wen
2aedf3e791 Minor refactor for tree_map and tree_unflatten (#311)
* Minor refactor for tree_map and tree_unflatten

* Remove the if statement

---------

Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2023-12-28 20:55:10 -08:00
Chunyang Wen
473b6b43b4 Use defaultdict (#307)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2023-12-28 14:46:13 -08:00
Angelos Katharopoulos
d29770eeaa Update batchnorm to have the running stats in parameters (#305) 2023-12-28 14:31:10 -08:00
Chunyang Wen
040c3bafab Add missing f str (#306)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2023-12-28 06:09:34 -08:00
Chunyang Wen
05767b026f Add information for dropout probability (#304)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2023-12-27 21:51:30 -08:00
Diogo
a83d5d60bd Addition in acknowledgements (#302) 2023-12-27 13:46:47 -08:00
Bahaa
ff2b58e299 Add support for repeat (#278)
* add repeat function

* fix styling

* optimizing repeat

* fixed minor issues

* not sure why that folder is there xD

* fixed now for sure

* test repeat not repeat test

* Fixed

---------

Co-authored-by: Bahaa Eddin tabbakha <bahaa@Bahaas-MacBook-Pro.local>
2023-12-27 13:11:38 -08:00
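A short sketch of the new op from the commit above (numpy-style semantics assumed):

```python
import mlx.core as mx

x = mx.array([[1, 2], [3, 4]])
print(mx.repeat(x, 2, axis=0))  # each row repeated twice -> shape (4, 2)
print(mx.repeat(x, 2))          # flattened, each element repeated twice
```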
YUN, Junwoo
4417e37ede Transformer fix (#167)
* add transformer with dropout, fix transformer ffm, layernorm order

* precommit changes

* precommit changes

* add docstring, activation, norm_first

* run precommit

* run precommit

* add doctstring

* precommit

* style nits in docs

---------

Co-authored-by: junwoo-yun <junwoo.yun@bagelcode.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-27 08:48:36 -08:00
Angelos Katharopoulos
79c95b6919 Fix load compilation (#298) 2023-12-27 06:20:45 -08:00
Diogo
1f6ab6a556 Safetensor support (#215)
Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-27 02:06:55 -08:00
Gabrijel Boduljak
6b0d30bb85 linalg.norm (#187)
* implemented vector_norm in cpp

added linalg to mlx

* implemented vector_norm python binding

* renamed vector_norm to norm, implemented norm without provided ord

* completed the implementation of the norm

* added tests

* removed unused import in linalg.cpp

* updated python bindings

* added some tests for python bindings

* handling inf, -inf as numpy does, more extensive tests of compatibility with numpy

* added better docs and examples

* refactored mlx.linalg.norm bindings

* reused existing util for implementation of linalg.norm

* more tests

* fixed a bug with no ord and axis provided

* removed unused imports

* some style and API consistency updates to linalg norm

* remove unused includes

* fix python tests

* fixed a bug with frobenius norm of a complex-valued matrix

* complex for vector too

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-26 19:42:04 -08:00
Angelos Katharopoulos
447bc089b9 Fix tolerance in de-/quantization test (#295) 2023-12-26 19:21:05 -08:00
Yutaka Kondo
fc4e5b476b Fix llama link in README.md (#289) 2023-12-25 20:53:20 -08:00
Daniel Strobusch
d58ac083f3 expose itemsize and nbytes as in numpy arrays (#284)
see:
  * https://numpy.org/doc/stable/reference/generated/numpy.ndarray.nbytes.html
  * https://numpy.org/doc/stable/reference/generated/numpy.ndarray.itemsize.html

relates to https://github.com/ml-explore/mlx-examples/pull/174
2023-12-25 10:34:28 -08:00
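A quick sketch of the two new attributes, which mirror the numpy properties linked above:

```python
import mlx.core as mx

x = mx.zeros((4, 4), dtype=mx.float32)
print(x.itemsize)  # 4 bytes per float32 element
print(x.nbytes)    # 64 = 16 elements * 4 bytes
```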
__mo_san__
a123c3c7d2 implement-batch-norm-layer (#217)
- Add batch normalization layer

---------

Co-authored-by: Robert McCraith <mccraithrobert@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-25 07:32:53 -08:00
Angelos Katharopoulos
9e6b8c9f48 Refactor the reduction kernels (#277) 2023-12-24 14:47:57 -08:00
Zach Schillaci
22fee5a383 Remove redundant assert in losses.py (#281) 2023-12-24 08:39:08 -08:00
Daniel Strobusch
7365d142a3 random.uniform must respect dtype, even if lower precision than "low" (#280)
Fix an edge case where random.uniform returned a float32 array even when a lower-precision dtype was requested, because adding the float32 "low" array promoted the result.
2023-12-24 07:04:43 -08:00
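A minimal check of the edge case described above: a float32 `low` array must not override a lower-precision requested dtype.

```python
import mlx.core as mx

low = mx.array(0.0, dtype=mx.float32)
x = mx.random.uniform(low=low, high=1.0, shape=(3,), dtype=mx.float16)
assert x.dtype == mx.float16
```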
Awni Hannun
8b227fa9af fix no metal build (#276) 2023-12-23 19:18:10 -08:00
Vidit Agarwal
8c3da54c7d Fix failing test for log cosh loss (#275)
* fix assert statement in log_cosh_loss

* reformatted by pre-commit black
2023-12-23 16:26:46 -08:00
Vidit Agarwal
acf1721b98 Corrected the example of value_and_grad (#274)
* Corrected the example for mx.value_and_grad

* Reformat through pre-commit/black
2023-12-23 11:06:38 -08:00
Finn Voorhees
f91f450141 Fix argmax returns documentation (#263) 2023-12-22 20:33:17 -08:00
Ronan Collobert
cd3616a463 Revisit autorelease memory pools (#260)
* make general autorelease pool part of metal device

* make things simpler

* no metal backend support

* new_memory_pool -> new_scoped_memory_pool
2023-12-22 11:01:26 -08:00
Nicholas Santavas
d35fa1db41 Add Hinge, Huber and LogCosh losses (#199) 2023-12-22 10:28:10 -08:00
Justin Deschenaux
e8deca84e0 Add dropout2d (#250) 2023-12-22 08:02:29 -08:00
230 changed files with 23582 additions and 5851 deletions


@@ -26,18 +26,28 @@ jobs:
command: |
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install pybind11-stubgen
pip install numpy
sudo apt-get update
sudo apt-get install libblas-dev
sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
- run:
name: Build python package
name: Install Python package
command: |
CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" CMAKE_BUILD_PARALLEL_LEVEL="" python3 setup.py build_ext --inplace
CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" CMAKE_BUILD_PARALLEL_LEVEL="" python3 setup.py develop
- run:
name: Run the python tests
name: Generate package stubs
command: |
python3 -m unittest discover python/tests
python3 setup.py generate_stubs
- run:
name: Run Python tests
command: |
python3 -m unittest discover python/tests -v
# TODO: Reenable when extension api becomes stable
# - run:
# name: Build example extension
# command: |
# cd examples/extensions && python3 -m pip install .
- run:
name: Build CPP only
command: |
@@ -47,142 +57,116 @@ jobs:
command: ./build/tests/tests
mac_build_and_test:
machine: true
resource_class: ml-explore/m-builder
macos:
xcode: "15.2.0"
resource_class: macos.m1.large.gen1
steps:
- checkout
- run:
name: Install dependencies
command: |
eval "$(conda shell.bash hook)"
rm -r $CONDA_PREFIX/envs/runner-env
conda create -y -n runner-env python=3.9
conda activate runner-env
brew install python@3.9
python3.9 -m venv env
source env/bin/activate
pip install --upgrade pip
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install pybind11-stubgen
pip install numpy
pip install torch
pip install tensorflow
pip install unittest-xml-reporting
- run:
name: Build python package
name: Install Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
CMAKE_BUILD_PARALLEL_LEVEL="" python setup.py build_ext --inplace
CMAKE_BUILD_PARALLEL_LEVEL="" python setup.py develop
source env/bin/activate
CMAKE_BUILD_PARALLEL_LEVEL="" pip install -e . -v
- run:
name: Run the python tests
name: Generate package stubs
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
DEVICE=gpu python -m xmlrunner discover -v python/tests -o test-results/gpu
source env/bin/activate
python setup.py generate_stubs
- run:
name: Run Python tests
command: |
source env/bin/activate
LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
# TODO: Reenable when Circle CI can run gpu jobs
# DEVICE=gpu python3.9 -m xmlrunner discover -v python/tests -o test-results/gpu
# TODO: Reenable when extension api becomes stable
# - run:
# name: Build example extension
# command: |
# cd examples/extensions && python3.11 -m pip install .
- store_test_results:
path: test-results
- run:
name: Build CPP only
command: |
source env/bin/activate
mkdir -p build && cd build && cmake .. && make -j
- run:
name: Run CPP tests
#command: METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
command: DEVICE=cpu ./build/tests/tests
build_release:
machine: true
resource_class: ml-explore/m-builder
parameters:
python_version:
type: string
default: "3.9"
macos_version:
xcode_version:
type: string
default: "14"
default: "15.2.0"
build_env:
type: string
default: ""
macos:
xcode: << parameters.xcode_version >>
resource_class: macos.m1.large.gen1
steps:
- checkout
- run:
name: Install dependencies
command: |
eval "$(conda shell.bash hook)"
rm -r $CONDA_PREFIX/envs/runner-env
conda create -y -n runner-env python=<< parameters.python_version >>
conda activate runner-env
brew install python@<< parameters.python_version >>
python<< parameters.python_version >> -m venv env
source env/bin/activate
pip install --upgrade pip
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install --upgrade setuptools
pip install pybind11-stubgen
pip install numpy
pip install twine
pip install build
- run:
name: Build pacakge
name: Install Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVELOPER_DIR=$(developer_dir_macos_<< parameters.macos_version >>) \
PYPI_RELEASE=1 \
source env/bin/activate
DEV_RELEASE=1 \
CMAKE_BUILD_PARALLEL_LEVEL="" \
python setup.py bdist_wheel
twine upload dist/* --repository mlx
- store_artifacts:
path: dist/
build_dev_release:
machine: true
resource_class: ml-explore/m-builder
parameters:
python_version:
type: string
default: "3.9"
macos_version:
type: string
default: "14"
steps:
- checkout
pip install . -v
- run:
name: Install dependencies
name: Generate package stubs
command: |
eval "$(conda shell.bash hook)"
rm -r $CONDA_PREFIX/envs/runner-env
conda create -y -n runner-env python=<< parameters.python_version >>
conda activate runner-env
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install numpy
pip install twine
source env/bin/activate
python setup.py generate_stubs
- run:
name: Build pacakge
name: Build Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVELOPER_DIR=$(developer_dir_macos_<< parameters.macos_version >>) \
DEV_RELEASE=1 \
source env/bin/activate
<< parameters.build_env >> \
CMAKE_BUILD_PARALLEL_LEVEL="" \
python setup.py bdist_wheel
twine upload dist/* --repository mlx
- store_artifacts:
path: dist/
build_package:
machine: true
resource_class: ml-explore/m-builder
parameters:
python_version:
type: string
default: "3.9"
macos_version:
type: string
default: "14"
steps:
- checkout
- run:
name: Install dependencies
command: |
eval "$(conda shell.bash hook)"
rm -r $CONDA_PREFIX/envs/runner-env
conda create -y -n runner-env python=<< parameters.python_version >>
conda activate runner-env
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install numpy
pip install twine
- run:
name: Build pacakge
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVELOPER_DIR=$(developer_dir_macos_<< parameters.macos_version >>) \
CMAKE_BUILD_PARALLEL_LEVEL="" \
python setup.py bdist_wheel
python -m build -w
- when:
condition: << parameters.build_env >>
steps:
- run:
name: Upload package
command: |
source env/bin/activate
twine upload dist/*
- store_artifacts:
path: dist/
@@ -193,8 +177,8 @@ workflows:
- not: << pipeline.parameters.nightly_build >>
- not: << pipeline.parameters.weekly_build >>
jobs:
- linux_build_and_test
- mac_build_and_test
- linux_build_and_test
- build_release:
filters:
tags:
@@ -204,20 +188,22 @@ workflows:
matrix:
parameters:
python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
macos_version: ["13", "14"]
xcode_version: ["14.3.1", "15.2.0"]
build_env: ["PYPI_RELEASE=1"]
nightly_build:
when: << pipeline.parameters.nightly_build >>
jobs:
- build_package:
- build_release:
matrix:
parameters:
python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
macos_version: ["13", "14"]
xcode_version: ["14.3.1", "15.2.0"]
weekly_build:
when: << pipeline.parameters.weekly_build >>
jobs:
- build_dev_release:
- build_release:
matrix:
parameters:
python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
macos_version: ["13", "14"]
xcode_version: ["14.3.1", "15.2.0"]
build_env: ["DEV_RELEASE=1"]

.github/ISSUE_TEMPLATE/bug_report.md (new file)

@@ -0,0 +1,28 @@
---
name: Bug report
about: Create a report about an issue you've encountered
title: "[BUG] "
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Include code snippet
```python
```
**Expected behavior**
A clear and concise description of what you expected to happen.
**Desktop (please complete the following information):**
- OS Version: [e.g. MacOS 14.1.2]
- Version [e.g. 0.7.0]
**Additional context**
Add any other context about the problem here.

.gitignore

@@ -6,6 +6,10 @@ __pycache__/
# C extensions
*.so
# tensor files
*.safe
*.safetensors
# Metal libraries
*.metallib
venv/


@@ -5,11 +5,11 @@ repos:
- id: clang-format
# Using this mirror lets us use mypyc-compiled black, which is about 2x faster
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 22.10.0
rev: 23.12.1
hooks:
- id: black
- repo: https://github.com/pycqa/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
args:


@@ -6,9 +6,16 @@ with a short description of your contribution(s) below. For example:
- Jane Smith: Added the `foo` and `bar` ops.
MLX was developed with contributions from the following individuals:
- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops.
- Juarez Bochi: Fixed bug in cross attention.
- Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, linear and logistic regression python example.
- Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile` and safetensor support
- Gabrijel Boduljak: Added `mlx.core.linalg`, implemented `norm` method and `InstanceNorm` layer.
<a href="https://github.com/ml-explore/mlx/graphs/contributors">
<img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
</a>
# Third-Party Software


@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.24)
project(mlx LANGUAGES CXX)
project(mlx LANGUAGES C CXX)
# ----------------------------- Setup -----------------------------
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
@@ -18,7 +18,7 @@ option(MLX_BUILD_METAL "Build metal backend" ON)
option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
if(NOT MLX_VERSION)
set(MLX_VERSION 0.0.6)
set(MLX_VERSION 0.2.0)
endif()
# --------------------- Processor tests -------------------------
@@ -29,9 +29,15 @@ set(MLX_BUILD_ARM OFF)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
if (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64")
message(WARNING
"Building for x86_64 on macOS is not supported."
if (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64" AND ${CMAKE_HOST_APPLE})
message(FATAL_ERROR
"Building for x86_64 on macOS is not supported."
" If you are on an Apple silicon system, check the build"
" documentation for possible fixes: "
"https://ml-explore.github.io/mlx/build/html/install.html#build-from-source")
elseif (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64")
message(WARNING
"Building for x86_64 on macOS is not supported."
" If you are on an Apple silicon system, "
" make sure you are building for arm64.")
elseif(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "arm64")
@@ -69,7 +75,7 @@ elseif (MLX_BUILD_METAL)
COMMAND_ERROR_IS_FATAL ANY)
message(STATUS "Building with SDK for macOS version ${MACOS_VERSION}")
if (${MACOS_VERSION} GREATER_EQUAL 14.2)
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14.2_iOS17.2.zip)
elseif (${MACOS_VERSION} GREATER_EQUAL 14.0)
@@ -117,16 +123,27 @@ else()
/usr/include
/usr/local/include
$ENV{BLAS_HOME}/include)
message(STATUS ${BLAS_LIBRARIES})
message(STATUS ${BLAS_INCLUDE_DIRS})
message(STATUS "Blas lib" ${BLAS_LIBRARIES})
message(STATUS "Blas include" ${BLAS_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
target_link_libraries(mlx ${BLAS_LIBRARIES})
find_package(LAPACK REQUIRED)
if (NOT LAPACK_FOUND)
message(FATAL_ERROR "Must have LAPACK installed")
endif()
find_path(LAPACK_INCLUDE_DIRS lapacke.h
/usr/include
/usr/local/include)
message(STATUS "Lapack lib" ${LAPACK_LIBRARIES})
message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
target_link_libraries(mlx ${LAPACK_LIBRARIES})
endif()
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)
target_include_directories(
mlx
mlx
PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:include>
@@ -152,6 +169,8 @@ if (MLX_BUILD_BENCHMARKS)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/benchmarks/cpp)
endif()
# ----------------------------- Installation -----------------------------
include(GNUInstallDirs)


@@ -1,3 +1,4 @@
include CMakeLists.txt
recursive-include mlx/ *
include python/src/*
include python/mlx/py.typed # support type hinting as in PEP-561


@@ -53,7 +53,7 @@ variety of examples, including:
- [Transformer language model](https://github.com/ml-explore/mlx-examples/tree/main/transformer_lm) training.
- Large-scale text generation with
[LLaMA](https://github.com/ml-explore/mlx-examples/tree/main/llama) and
[LLaMA](https://github.com/ml-explore/mlx-examples/tree/main/llms/llama) and
finetuning with [LoRA](https://github.com/ml-explore/mlx-examples/tree/main/lora).
- Generating images with [Stable Diffusion](https://github.com/ml-explore/mlx-examples/tree/main/stable_diffusion).
- Speech recognition with [OpenAI's Whisper](https://github.com/ml-explore/mlx-examples/tree/main/whisper).
@@ -61,17 +61,25 @@ variety of examples, including:
## Quickstart
See the [quick start
guide](https://ml-explore.github.io/mlx/build/html/quick_start.html)
guide](https://ml-explore.github.io/mlx/build/html/usage/quick_start.html)
in the documentation.
## Installation
MLX is available on [PyPI](https://pypi.org/project/mlx/). To install the Python API, run:
**With `pip`**:
```
pip install mlx
```
**With `conda`**:
```
conda install -c conda-forge mlx
```
Checkout the
[documentation](https://ml-explore.github.io/mlx/build/html/install.html#)
for more information on building the C++ and Python APIs from source.
@@ -85,7 +93,7 @@ information on building from source, and running tests.
We are grateful for all of [our
contributors](ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
to MLX and wish to be acknowledged, please add your name to to the list in your
to MLX and wish to be acknowledged, please add your name to the list in your
pull request.
## Citing MLX


@@ -233,6 +233,20 @@ void time_gather_scatter() {
TIME(single_element_add);
}
void time_divmod() {
auto a = random::normal({1000});
auto b = random::normal({1000});
eval({a, b});
auto divmod_fused = [&a, &b]() { return divmod(a, b); };
TIME(divmod_fused);
auto divmod_separate = [&a, &b]() {
return std::vector<array>{floor_divide(a, b), remainder(a, b)};
};
TIME(divmod_separate);
}
int main() {
std::cout << "Benchmarks for " << default_device() << std::endl;
time_creation_ops();
@@ -246,4 +260,5 @@ int main() {
time_matmul();
time_reductions();
time_gather_scatter();
time_divmod();
}

View File

@@ -166,13 +166,13 @@ if __name__ == "__main__":
dtypes = ("float32", "float16")
transposes = ("nn", "nt", "tn")
shapes = (
(16, 234, 768, 3072),
(1, 64, 64, 25344),
(16, 1024, 1024, 1024),
(1, 1024, 1024, 2048),
(4, 1024, 1024, 4096),
(4, 1024, 4096, 1024),
(1, 4096, 4096, 4096),
(15, 1023, 1023, 1023),
(17, 1025, 1025, 1025),
)
for dtype in dtypes:

View File

@@ -133,7 +133,7 @@ def get_gbyte_size(in_vec_len, out_vec_len, np_dtype):
return float(N_iter_bench * N_iter_func * n_elem * item_size) / float(1024**3)
def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, tranpose):
def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, transpose):
np_dtype = getattr(np, dtype)
mlx_gb_s = []
mlx_gflops = []
@@ -164,7 +164,7 @@ def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, tranpose):
ax.legend()
def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, tranpose):
def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose):
np_dtype = getattr(np, dtype)
mlx_gb_s = []
mlx_gflops = []

View File

@@ -4,6 +4,7 @@ import argparse
import math
import os
import time
from functools import partial
import mlx.core as mx
import mlx.nn as nn
@@ -59,15 +60,63 @@ def matmul(x, y):
mx.eval(ys)
def quant_matmul(x, w, s, b):
groups = x.shape[-1] // s.shape[-1]
width = 32 // (x.shape[-1] // w.shape[0])
def _quant_matmul(x, w, s, b, transpose, group_size, bits):
ys = []
for i in range(10):
ys.append(mx.quantized_matmul(x, w, s, b, groups=groups, width=width))
ys.append(
mx.quantized_matmul(
x, w, s, b, transpose=transpose, group_size=group_size, bits=bits
)
)
mx.eval(ys)
quant_matmul = {
"quant_matmul_32_2": partial(_quant_matmul, transpose=False, group_size=32, bits=2),
"quant_matmul_32_4": partial(_quant_matmul, transpose=False, group_size=32, bits=4),
"quant_matmul_32_8": partial(_quant_matmul, transpose=False, group_size=32, bits=8),
"quant_matmul_64_2": partial(_quant_matmul, transpose=False, group_size=64, bits=2),
"quant_matmul_64_4": partial(_quant_matmul, transpose=False, group_size=64, bits=4),
"quant_matmul_64_8": partial(_quant_matmul, transpose=False, group_size=64, bits=8),
"quant_matmul_128_2": partial(
_quant_matmul, transpose=False, group_size=128, bits=2
),
"quant_matmul_128_4": partial(
_quant_matmul, transpose=False, group_size=128, bits=4
),
"quant_matmul_128_8": partial(
_quant_matmul, transpose=False, group_size=128, bits=8
),
"quant_matmul_t_32_2": partial(
_quant_matmul, transpose=True, group_size=32, bits=2
),
"quant_matmul_t_32_4": partial(
_quant_matmul, transpose=True, group_size=32, bits=4
),
"quant_matmul_t_32_8": partial(
_quant_matmul, transpose=True, group_size=32, bits=8
),
"quant_matmul_t_64_2": partial(
_quant_matmul, transpose=True, group_size=64, bits=2
),
"quant_matmul_t_64_4": partial(
_quant_matmul, transpose=True, group_size=64, bits=4
),
"quant_matmul_t_64_8": partial(
_quant_matmul, transpose=True, group_size=64, bits=8
),
"quant_matmul_t_128_2": partial(
_quant_matmul, transpose=True, group_size=128, bits=2
),
"quant_matmul_t_128_4": partial(
_quant_matmul, transpose=True, group_size=128, bits=4
),
"quant_matmul_t_128_8": partial(
_quant_matmul, transpose=True, group_size=128, bits=8
),
}
def conv1d(x, y):
ys = []
for i in range(10):
@@ -220,6 +269,13 @@ def linear(w, b, x):
mx.eval(ys)
def linear_fused(w, b, x):
ys = []
for i in range(10):
ys.append(mx.addmm(b, x, mx.transpose(w, (1, 0))))
mx.eval(ys)
def rope(x):
*_, N, D = x.shape
ys = []
@@ -356,11 +412,14 @@ if __name__ == "__main__":
elif args.benchmark == "matmul":
print(bench(matmul, *xs))
elif args.benchmark == "quant_matmul":
print(bench(quant_matmul, *xs))
elif args.benchmark.startswith("quant_matmul"):
print(bench(quant_matmul[args.benchmark], *xs))
elif args.benchmark == "linear":
print(bench(linear, *xs))
if args.fused:
print(bench(linear_fused, *xs))
else:
print(bench(linear, *xs))
elif args.benchmark == "sum_axis":
print(bench(reduction, "sum", axis, x))

View File

@@ -62,7 +62,7 @@ def make_predicate(positive_filter, negative_filter):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run comparisons agains PyTorch")
parser = argparse.ArgumentParser(description="Run comparisons against PyTorch")
parser.add_argument(
"--filter", "-f", help="Regex filter to select benchmarks", nargs="+"
)
@@ -125,6 +125,14 @@ if __name__ == "__main__":
compare_filtered("sum_axis --size 16x128x1024 --axis 1")
compare_filtered("sum_axis --size 16x128x1024 --axis 0 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,1 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,1")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,2 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,2")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,1 --transpose 0,2,1 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,1 --transpose 0,2,1")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,2 --transpose 0,2,1 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,2 --transpose 0,2,1")
compare_filtered("argmax --size 10x1024x128 --axis 1 --cpu")
compare_filtered("argmax --size 10x1024x128 --axis 1")
compare_filtered("argmax --size 10x1024x128 --axis 2 --cpu")

View File

@@ -0,0 +1,64 @@
# Copyright © 2023-2024 Apple Inc.
import argparse
from time import time
import mlx.core as mx
import torch
def measure_runtime(fn, **kwargs):
# Warmup
for _ in range(5):
fn(**kwargs)
tic = time()
iters = 10
for _ in range(iters):
fn(**kwargs)
return (time() - tic) * 1000 / iters
def benchmark_gather_mlx(x_shape, idx_shape):
def gather(x, idx):
mx.eval(x[idx])
idx = mx.random.randint(0, x_shape[0] - 1, idx_shape)
x = mx.random.normal(x_shape).astype(mx.float32)
runtime = measure_runtime(gather, x=x, idx=idx)
print(f"MLX: {runtime:.3f}ms")
def benchmark_gather_torch(x_shape, idx_shape, device):
def gather(x, idx, device):
_ = x[idx]
if device == torch.device("mps"):
torch.mps.synchronize()
idx = torch.randint(0, x_shape[0] - 1, idx_shape).to(device)
x = torch.randn(x_shape, dtype=torch.float32).to(device)
runtime = measure_runtime(gather, x=x, idx=idx, device=device)
print(f"PyTorch: {runtime:.3f}ms")
if __name__ == "__main__":
parser = argparse.ArgumentParser("Gather benchmarks.")
parser.add_argument("--cpu", action="store_true", help="Use the CPU.")
args = parser.parse_args()
if args.cpu:
mx.set_default_device(mx.cpu)
device = torch.device("cpu")
else:
device = torch.device("mps")
idx_shapes = [(1_000_000,), (100_000,), ()]
x_shapes = [(100, 64), (100, 1024), (4, 1_000_000)]
for x_shape, idx_shape in zip(x_shapes, idx_shapes):
print("=" * 20)
print(f"X {x_shape}, Indices {idx_shape}")
benchmark_gather_mlx(x_shape, idx_shape)
benchmark_gather_torch(x_shape, idx_shape, device=device)

View File

@@ -1,198 +0,0 @@
# Copyright © 2023 Apple Inc.
import math
import time
import jax
import jax.numpy as jnp
from flax import linen as nn
class RoPE(nn.Module):
dims: int
traditional: bool = False
def _compute_rope(self, costheta, sintheta, x):
x1 = x[..., : self.dims // 2]
x2 = x[..., self.dims // 2 : self.dims]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
rx = jnp.concatenate([rx1, rx2, x[..., self.dims :]], axis=-1)
else:
rx = jnp.concatenate([rx1, rx2], axis=-1)
return rx
def _compute_traditional_rope(self, costheta, sintheta, x):
x1 = x[..., ::2]
x2 = x[..., 1::2]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
raise NotImplementedError(
"RoPE doesn't implement partial traditional application"
)
rx = jnp.concatenate([rx1[..., None], rx2[..., None]], axis=-1)
return rx
@staticmethod
def create_cos_sin_theta(
N: int,
D: int,
offset: int = 0,
base: float = 10000,
dtype=jnp.float32,
):
D = D // 2
positions = jnp.arange(offset, N, dtype=dtype)
freqs = jnp.exp(-jnp.arange(0, D, dtype=dtype) * (math.log(base) / D))
theta = positions.reshape((-1, 1)) * freqs.reshape((1, -1))
costheta = jnp.cos(theta)
sintheta = jnp.sin(theta)
return costheta, sintheta
@nn.compact
def __call__(self, x, offset: int = 0):
shape = x.shape
x = x.reshape((-1, shape[-2], shape[-1]))
N = x.shape[1] + offset
costheta, sintheta = RoPE.create_cos_sin_theta(
N, self.dims, offset=offset, dtype=x.dtype
)
rope = (
self._compute_traditional_rope if self.traditional else self._compute_rope
)
rx = rope(costheta, sintheta, x)
return rx.reshape(shape)
class LlamaAttention(nn.Module):
dims: int
num_heads: int
dtype: jnp.dtype
def setup(self):
num_heads = self.num_heads
dims = self.dims
self.rope = RoPE(dims // num_heads, True)
self.query_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
self.key_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
self.value_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
self.out_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
def __call__(self, queries, keys, values, mask=None, cache=None):
queries = self.query_proj(queries)
keys = self.key_proj(keys)
values = self.value_proj(values)
num_heads = self.num_heads
B, L, D = queries.shape
queries = queries.reshape((B, L, num_heads, -1)).transpose((0, 2, 1, 3))
keys = keys.reshape((B, L, num_heads, -1)).transpose((0, 2, 1, 3))
values = values.reshape((B, L, num_heads, -1)).transpose((0, 2, 1, 3))
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = jnp.concatenate([key_cache, keys], axis=2)
values = jnp.concatenate([value_cache, values], axis=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
# Dimensions are [batch x num heads x sequence x hidden dim]
scale = math.sqrt(1 / queries.shape[-1])
scores = (queries * scale) @ keys.transpose((0, 1, 3, 2))
if mask is not None:
scores = scores + mask
scores = jax.nn.softmax(scores, axis=-1)
values_hat = (scores @ values).transpose((0, 2, 1, 3)).reshape((B, L, -1))
return self.out_proj(values_hat), (keys, values)
class LlamaEncoderLayer(nn.Module):
dims: int
mlp_dims: int
num_heads: int
dtype: jnp.dtype
def setup(self):
dims = self.dims
mlp_dims = self.mlp_dims
num_heads = self.num_heads
self.attention = LlamaAttention(dims, num_heads, dtype)
self.norm1 = nn.RMSNorm(param_dtype=self.dtype)
self.norm2 = nn.RMSNorm(param_dtype=self.dtype)
self.linear1 = nn.Dense(mlp_dims, use_bias=False, param_dtype=self.dtype)
self.linear2 = nn.Dense(mlp_dims, use_bias=False, param_dtype=self.dtype)
self.linear3 = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
def __call__(self, x, mask=None, cache=None):
y = self.norm1(x)
y, cache = self.attention(y, y, y, mask, cache)
x = x + y
y = self.norm2(x)
a = self.linear1(y)
b = self.linear2(y)
y = jax.nn.silu(a) * b
y = self.linear3(y)
x = x + y
return x, cache
def measure(model, x, cache):
for i in range(5):
y, c = model(x, mask=None, cache=cache)
jax.block_until_ready((y, c))
start = time.time()
for i in range(5):
y, c = model(x, mask=None, cache=cache)
jax.block_until_ready((y, c))
end = time.time()
return (end - start) * 1000 / 5
if __name__ == "__main__":
H = 32
D = 4096
F = 43 * 256
C = 1000
dtype = jnp.float16
k1, k2, k3, k4 = jax.random.split(jax.random.PRNGKey(0), 4)
x = jax.random.normal(k1, (1, 1, D), dtype)
cache = [
jax.random.normal(k2, [1, H, C, D // H], dtype),
jax.random.normal(k3, [1, H, C, D // H], dtype),
]
layer = LlamaEncoderLayer(D, F, H, dtype=dtype)
params = layer.init(k4, x, mask=None, cache=cache)["params"]
@jax.jit
def model_fn(x, mask, cache):
return layer.apply({"params": params}, x, mask=mask, cache=cache)
T = measure(model_fn, x, cache)
print("Time per layer per token:", T, "ms")
print("Lower bound total time per token:", T * 32, "ms")

View File

@@ -1,118 +0,0 @@
# Copyright © 2023 Apple Inc.
import math
import time
import mlx.core as mx
import mlx.nn as nn
import mlx.utils
class LlamaAttention(nn.Module):
def __init__(self, dims: int, num_heads: int):
super().__init__()
self.num_heads = num_heads
self.rope = nn.RoPE(dims // num_heads, True)
self.query_proj = nn.Linear(dims, dims, False)
self.key_proj = nn.Linear(dims, dims, False)
self.value_proj = nn.Linear(dims, dims, False)
self.out_proj = nn.Linear(dims, dims, False)
def __call__(self, queries, keys, values, mask=None, cache=None):
queries = self.query_proj(queries)
keys = self.key_proj(keys)
values = self.value_proj(values)
num_heads = self.num_heads
B, L, D = queries.shape
queries = mx.transpose(mx.reshape(queries, (B, L, num_heads, -1)), (0, 2, 1, 3))
keys = mx.transpose(mx.reshape(keys, (B, L, num_heads, -1)), (0, 2, 1, 3))
values = mx.transpose(mx.reshape(values, (B, L, num_heads, -1)), (0, 2, 1, 3))
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = mx.concatenate([key_cache, keys], axis=2)
values = mx.concatenate([value_cache, values], axis=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
# Dimensions are [batch x num heads x sequence x hidden dim]
scale = mx.array(math.sqrt(1 / queries.shape[-1]), dtype=queries.dtype)
scores = (queries * scale) @ mx.transpose(keys, (0, 1, 3, 2))
if mask is not None:
scores = scores + mask
scores = mx.softmax(scores, axis=-1)
values_hat = mx.reshape(mx.transpose(scores @ values, (0, 2, 1, 3)), (B, L, -1))
return self.out_proj(values_hat), (keys, values)
class LlamaEncoderLayer(nn.Module):
def __init__(self, dims: int, mlp_dims: int, num_heads: int):
super().__init__()
self.attention = LlamaAttention(dims, num_heads)
self.norm1 = nn.RMSNorm(dims)
self.norm2 = nn.RMSNorm(dims)
self.linear1 = nn.Linear(dims, mlp_dims, False)
self.linear2 = nn.Linear(dims, mlp_dims, False)
self.linear3 = nn.Linear(mlp_dims, dims, False)
def __call__(self, x, mask=None, cache=None):
y = self.norm1(x)
y, cache = self.attention(y, y, y, mask, cache)
x = x + y
y = self.norm2(x)
a = self.linear1(y)
b = self.linear2(y)
y = a * mx.sigmoid(a) * b
y = self.linear3(y)
x = x + y
return x, cache
def measure(model, x, cache):
for i in range(5):
y, c = model(x, mask=None, cache=cache)
mx.eval(y, c)
start = time.time()
rs = []
for i in range(5):
y, c = model(x, mask=None, cache=cache)
rs.append((y, c))
mx.eval(rs)
end = time.time()
return (end - start) * 1000 / 5
if __name__ == "__main__":
H = 32
D = 4096
F = 43 * 256
C = 1000
mx.set_default_device(mx.gpu)
dtype = mx.float16
layer = LlamaEncoderLayer(D, F, H)
layer.update(mlx.utils.tree_map(lambda x: x.astype(dtype), layer.parameters()))
k1, k2, k3 = mx.random.split(mx.random.key(0), 3)
x = mx.random.normal([1, 1, D], dtype=dtype)
cache = [
mx.random.normal([1, H, C, D // H], dtype=dtype),
mx.random.normal([1, H, C, D // H], dtype=dtype),
]
mx.eval(x, cache)
T = measure(layer, x, cache)
print("Time per layer per token:", T, "ms")
print("Lower bound total time per token:", T * 32, "ms")

View File

@@ -1,199 +0,0 @@
# Copyright © 2023 Apple Inc.
import math
import time
import torch
import torch.mps
import torch.nn as nn
def sync_if_needed(x):
if x.device != torch.device("cpu"):
torch.mps.synchronize()
class RoPE(nn.Module):
def __init__(self, dims: int, traditional: bool = False):
super().__init__()
self.dims = dims
self.traditional = traditional
def _compute_rope(self, costheta, sintheta, x):
x1 = x[..., : self.dims // 2]
x2 = x[..., self.dims // 2 : self.dims]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
rx = torch.cat([rx1, rx2, x[..., self.dims :]], dim=-1)
else:
rx = torch.cat([rx1, rx2], dim=-1)
return rx
def _compute_traditional_rope(self, costheta, sintheta, x):
x1 = x[..., ::2]
x2 = x[..., 1::2]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
raise NotImplementedError(
"RoPE doesn't implement partial traditional application"
)
rx = torch.cat([rx1[..., None], rx2[..., None]], dim=-1)
return rx
def forward(self, x, offset: int = 0):
shape = x.shape
x = x.view(-1, shape[-2], shape[-1])
N = x.shape[1] + offset
costheta, sintheta = RoPE.create_cos_sin_theta(
N, self.dims, offset=offset, device=x.device, dtype=x.dtype
)
rope = (
self._compute_traditional_rope if self.traditional else self._compute_rope
)
rx = rope(costheta, sintheta, x)
return rx.view(*shape)
@staticmethod
def create_cos_sin_theta(
N: int,
D: int,
offset: int = 0,
base: float = 10000,
device="cpu",
dtype=torch.float32,
):
D = D // 2
positions = torch.arange(offset, N, dtype=dtype, device=device)
freqs = torch.exp(
-torch.arange(0, D, dtype=dtype, device=device) * (math.log(base) / D)
)
theta = positions.view(-1, 1) * freqs.view(1, -1)
costheta = torch.cos(theta)
sintheta = torch.sin(theta)
return costheta, sintheta
class RMSNorm(nn.Module):
def __init__(self, dims: int, epsilon: float = 1e-6):
super().__init__()
self.gamma = nn.Parameter(torch.ones((dims,)))
self.epsilon = epsilon
def forward(self, x):
n = torch.rsqrt(x.square().mean(dim=-1, keepdims=True) + self.epsilon)
return self.gamma * x * n
class LlamaAttention(nn.Module):
def __init__(self, dims: int, num_heads: int):
super().__init__()
self.num_heads = num_heads
self.rope = RoPE(dims // num_heads, True)
self.query_proj = nn.Linear(dims, dims, bias=False)
self.key_proj = nn.Linear(dims, dims, bias=False)
self.value_proj = nn.Linear(dims, dims, bias=False)
self.out_proj = nn.Linear(dims, dims, bias=False)
def forward(self, queries, keys, values, mask=None, cache=None):
queries = self.query_proj(queries)
keys = self.key_proj(keys)
values = self.value_proj(values)
num_heads = self.num_heads
B, L, D = queries.shape
queries = queries.view(B, L, num_heads, -1).permute(0, 2, 1, 3)
keys = keys.view(B, L, num_heads, -1).permute(0, 2, 1, 3)
values = values.view(B, L, num_heads, -1).permute(0, 2, 1, 3)
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = torch.cat([key_cache, keys], dim=2)
values = torch.cat([value_cache, values], dim=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
# Dimensions are [batch x num heads x sequence x hidden dim]
scale = math.sqrt(1 / queries.shape[-1])
scores = (queries * scale) @ keys.permute(0, 1, 3, 2)
if mask is not None:
scores = scores + mask
scores = torch.softmax(scores, dim=-1)
values_hat = (scores @ values).permute(0, 2, 1, 3).reshape(B, L, -1)
return self.out_proj(values_hat), (keys, values)
class LlamaEncoderLayer(nn.Module):
def __init__(self, dims: int, mlp_dims: int, num_heads: int):
super().__init__()
self.attention = LlamaAttention(dims, num_heads)
self.norm1 = RMSNorm(dims)
self.norm2 = RMSNorm(dims)
self.linear1 = nn.Linear(dims, mlp_dims, bias=False)
self.linear2 = nn.Linear(dims, mlp_dims, bias=False)
self.linear3 = nn.Linear(mlp_dims, dims, bias=False)
def forward(self, x, mask=None, cache=None):
y = self.norm1(x)
y, cache = self.attention(y, y, y, mask, cache)
x = x + y
y = self.norm2(x)
a = self.linear1(y)
b = self.linear2(y)
y = torch.nn.functional.silu(a) * b
y = self.linear3(y)
x = x + y
return x, cache
@torch.no_grad()
def measure(model, x, cache):
for i in range(5):
y, c = model(x, mask=None, cache=cache)
sync_if_needed(x)
start = time.time()
for i in range(5):
y, c = model(x, mask=None, cache=cache)
sync_if_needed(x)
end = time.time()
return (end - start) * 1000 / 5
if __name__ == "__main__":
H = 32
D = 4096
F = 43 * 256
C = 1000
device = torch.device("mps")
dtype = torch.float16
layer = LlamaEncoderLayer(D, F, H).to(device).to(dtype)
x = torch.randn(1, 1, D).to(device).to(dtype)
cache = [
torch.randn(1, H, C, D // H).to(device).to(dtype),
torch.randn(1, H, C, D // H).to(device).to(dtype),
]
T = measure(layer, x, cache)
print("Time per layer per token:", T, "ms")
print("Lower bound total time per token:", T * 32, "ms")

View File

@@ -44,6 +44,13 @@ def time_matmul():
time_fn(mx.matmul, a, b)
def time_maximum():
a = mx.random.uniform(shape=(32, 1024, 1024))
b = mx.random.uniform(shape=(32, 1024, 1024))
mx.eval(a, b)
time_fn(mx.maximum, a, b)
def time_negative():
a = mx.random.uniform(shape=(10000, 1000))
mx.eval(a)
@@ -101,6 +108,7 @@ if __name__ == "__main__":
time_add()
time_matmul()
time_maximum()
time_exp()
time_negative()
time_logsumexp()

View File

@@ -12,7 +12,7 @@ include(CMakeParseArguments)
# OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib
# SOURCES: List of source files
# INCLUDE_DIRS: List of include dirs
# DEPS: List of depedency files (like headers)
# DEPS: List of dependency files (like headers)
#
macro(mlx_build_metallib)
# Parse args
@@ -32,7 +32,7 @@ macro(mlx_build_metallib)
# Collect compile options
set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math)
# Prepare metllib build command
# Prepare metallib build command
add_custom_command(
OUTPUT ${MTLLIB_BUILD_TARGET}
COMMAND xcrun -sdk macosx metal

View File

@@ -26,7 +26,7 @@ python -m http.server <port>
and point your browser to `http://localhost:<port>`.
### Push to Github Pages
### Push to GitHub Pages
Check-out the `gh-pages` branch (`git switch gh-pages`) and build
the docs. Then force add the `build/html` directory:

View File

@@ -0,0 +1,33 @@
{{ fullname | escape | underline}}
.. currentmodule:: {{ module }}
.. add toctree option to make autodoc generate the pages
.. autoclass:: {{ objname }}
{% block attributes %}
{% if attributes %}
.. rubric:: Attributes
.. autosummary::
:toctree: .
{% for item in attributes %}
~{{ fullname }}.{{ item }}
{%- endfor %}
{% endif %}
{% endblock %}
{% block methods %}
{% if methods %}
.. rubric:: Methods
.. autosummary::
:toctree: .
{% for item in methods %}
{%- if item not in inherited_members and item != '__init__' %}
~{{ fullname }}.{{ item }}
{%- endif -%}
{%- endfor %}
{% endif %}
{% endblock %}

View File

@@ -1,19 +0,0 @@
{{ fullname | escape | underline}}
.. currentmodule:: {{ module }}
.. autoclass:: {{ objname }}
{#{% block methods %}
{% if methods %}
.. rubric:: {{ _('Methods') }}
.. autosummary::
{% for item in methods %}
{%- if item not in inherited_members and item != '__init__' %}
~{{ name }}.{{ item }}
{%- endif %}
{%- endfor %}
{% endif %}
{% endblock %}#}

View File

@@ -5,13 +5,15 @@
import os
import subprocess
import mlx.core as mx
# -- Project information -----------------------------------------------------
project = "MLX"
copyright = "2023, MLX Contributors"
author = "MLX Contributors"
version = "0.0.6"
release = "0.0.6"
version = ".".join(mx.__version__.split(".")[:3])
release = version
# -- General configuration ---------------------------------------------------

View File

@@ -15,7 +15,7 @@ Introducing the Example
-----------------------
Let's say that you would like an operation that takes in two arrays,
``x`` and ``y``, scales them both by some coefficents ``alpha`` and ``beta``
``x`` and ``y``, scales them both by some coefficients ``alpha`` and ``beta``
respectively, and then adds them together to get the result
``z = alpha * x + beta * y``. Well, you can very easily do that by just
writing out a function as follows:
@@ -69,7 +69,7 @@ C++ API:
.. code-block:: C++
/**
* Scale and sum two vectors elementwise
* Scale and sum two vectors element-wise
* z = alpha * x + beta * y
*
* Follow numpy style broadcasting between x and y
@@ -230,7 +230,7 @@ Let's re-implement our operation now in terms of our :class:`Axpby` primitive.
This operation now handles the following:
#. Upcast inputs and resolve the the output data type.
#. Upcast inputs and resolve the output data type.
#. Broadcast the inputs and resolve the output shape.
#. Construct the primitive :class:`Axpby` using the given stream, ``alpha``, and ``beta``.
#. Construct the output :class:`array` using the primitive and the inputs.
@@ -284,14 +284,14 @@ pointwise. This is captured in the templated function :meth:`axpby_impl`.
T alpha = static_cast<T>(alpha_);
T beta = static_cast<T>(beta_);
// Do the elementwise operation for each output
// Do the element-wise operation for each output
for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
// Map linear indices to offsets in x and y
auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());
// We allocate the output to be contiguous and regularly strided
// (defaults to row major) and hence it doesn't need additonal mapping
// (defaults to row major) and hence it doesn't need additional mapping
out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
}
}
@@ -305,7 +305,7 @@ if we encounter an unexpected type.
/** Fall back implementation for evaluation on CPU */
void Axpby::eval(const std::vector<array>& inputs, array& out) {
// Check the inputs (registered in the op while contructing the out array)
// Check the inputs (registered in the op while constructing the out array)
assert(inputs.size() == 2);
auto& x = inputs[0];
auto& y = inputs[1];
@@ -485,7 +485,7 @@ each data type.
instantiate_axpby(float32, float);
instantiate_axpby(float16, half);
instantiate_axpby(bflot16, bfloat16_t);
instantiate_axpby(bfloat16, bfloat16_t);
instantiate_axpby(complex64, complex64_t);
This kernel will be compiled into a metal library ``mlx_ext.metallib`` as we
@@ -537,7 +537,7 @@ below.
compute_encoder->setComputePipelineState(kernel);
// Kernel parameters are registered with buffer indices corresponding to
// those in the kernel decelaration at axpby.metal
// those in the kernel declaration at axpby.metal
int ndim = out.ndim();
size_t nelem = out.size();
@@ -568,7 +568,7 @@ below.
// Fix the 3D size of the launch grid (in terms of threads)
MTL::Size grid_dims = MTL::Size(nelem, 1, 1);
// Launch the grid with the given number of threads divded among
// Launch the grid with the given number of threads divided among
// the given threadgroups
compute_encoder->dispatchThreads(grid_dims, group_dims);
}
@@ -581,7 +581,7 @@ to give us the active metal compute command encoder instead of building a
new one and calling :meth:`compute_encoder->end_encoding` at the end.
MLX keeps adding kernels (compute pipelines) to the active command encoder
until some specified limit is hit or the compute encoder needs to be flushed
for synchronization. MLX also handles enqueuing and commiting the associated
for synchronization. MLX also handles enqueuing and committing the associated
command buffers as needed. We suggest taking a deeper dive into
:class:`metal::Device` if you would like to study this routine further.
@@ -601,8 +601,8 @@ us the following :meth:`Axpby::jvp` and :meth:`Axpby::vjp` implementations.
const std::vector<array>& tangents,
const std::vector<int>& argnums) {
// Forward mode diff that pushes along the tangents
// The jvp transform on the the primitive can built with ops
// that are scheduled on the same stream as the primtive
// The jvp transform on the primitive can built with ops
// that are scheduled on the same stream as the primitive
// If argnums = {0}, we only push along x in which case the
// jvp is just the tangent scaled by alpha
@@ -642,7 +642,7 @@ own :class:`Primitive`.
.. code-block:: C++
/** Vectorize primitve along given axis */
/** Vectorize primitive along given axis */
std::pair<array, int> Axpby::vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) {
@@ -666,7 +666,7 @@ Let's look at the overall directory structure first.
| └── setup.py
* ``extensions/axpby/`` defines the C++ extension library
* ``extensions/mlx_sample_extensions`` sets out the strucutre for the
* ``extensions/mlx_sample_extensions`` sets out the structure for the
associated python package
* ``extensions/bindings.cpp`` provides python bindings for our operation
* ``extensions/CMakeLists.txt`` holds CMake rules to build the library and
@@ -677,9 +677,9 @@ Let's look at the overall directory structure first.
Binding to Python
^^^^^^^^^^^^^^^^^^
We use PyBind11_ to build a Python API for the C++ library. Since bindings
for all needed components such as `mlx.core.array`, `mlx.core.stream`, etc.
are already provided, adding our :meth:`axpby` becomes very simple!
We use PyBind11_ to build a Python API for the C++ library. Since bindings for
components such as :class:`mlx.core.array`, :class:`mlx.core.stream`, etc. are
already provided, adding our :meth:`axpby` is simple!
.. code-block:: C++
@@ -697,7 +697,7 @@ are already provided, adding our :meth:`axpby` becomes very simple!
py::kw_only(),
"stream"_a = py::none(),
R"pbdoc(
Scale and sum two vectors elementwise
Scale and sum two vectors element-wise
``z = alpha * x + beta * y``
Follows numpy style broadcasting between ``x`` and ``y``
@@ -840,7 +840,7 @@ This will result in a directory structure as follows:
| ...
When you try to install using the command ``python -m pip install .``
(in ``extensions/``), the package will be installed with the same strucutre as
(in ``extensions/``), the package will be installed with the same structure as
``extensions/mlx_sample_extensions`` and the C++ and metal library will be
copied along with the python binding since they are specified as ``package_data``.
@@ -927,18 +927,18 @@ Results:
We see some modest improvements right away!
This operation is now good to be used to build other operations,
in :class:`mlx.nn.Module` calls, and also as a part of graph
transformations such as :meth:`grad` and :meth:`simplify`!
This operation is now good to be used to build other operations, in
:class:`mlx.nn.Module` calls, and also as a part of graph transformations like
:meth:`grad`!
Scripts
-------
.. admonition:: Download the code
The full example code is available in `mlx-examples <code>`_.
The full example code is available in `mlx <code>`_.
.. code: `TODO_LINK/extensions`_
.. code: `https://github.com/ml-explore/mlx/tree/main/examples/extensions/`_
.. _Accelerate: https://developer.apple.com/documentation/accelerate/blas?language=objc
.. _Metal: https://developer.apple.com/documentation/metal?language=objc

View File

@@ -371,7 +371,7 @@ Scripts
The full example code is available in `mlx-examples`_.
.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llama
.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llms/llama
.. [1] Su, J., Lu, Y., Pan, S., Murtadha, A., Wen, B. and Liu, Y., 2021.
Roformer: Enhanced transformer with rotary position embedding. arXiv

View File

@@ -19,7 +19,7 @@ The main differences between MLX and NumPy are:
The design of MLX is inspired by frameworks like `PyTorch
<https://pytorch.org/>`_, `Jax <https://github.com/google/jax>`_, and
`ArrayFire <https://arrayfire.org/>`_. A noteable difference from these
`ArrayFire <https://arrayfire.org/>`_. A notable difference from these
frameworks and MLX is the *unified memory model*. Arrays in MLX live in shared
memory. Operations on MLX arrays can be performed on any of the supported
device types without performing data copies. Currently supported device types
@@ -35,9 +35,15 @@ are the CPU and GPU.
:caption: Usage
:maxdepth: 1
quick_start
unified_memory
using_streams
usage/quick_start
usage/lazy_evaluation
usage/unified_memory
usage/indexing
usage/saving_and_loading
usage/function_transforms
usage/compile
usage/numpy
usage/using_streams
.. toctree::
:caption: Examples
@@ -57,6 +63,7 @@ are the CPU and GPU.
python/random
python/transforms
python/fft
python/linalg
python/nn
python/optimizers
python/tree_utils

View File

@@ -1,8 +1,8 @@
Build and Install
=================
Install from PyPI
-----------------
Python Installation
-------------------
MLX is available on PyPI. All you have to do to use MLX with your own Apple
silicon computer is
@@ -21,6 +21,14 @@ To install from PyPI you must meet the following requirements:
MLX is only available on devices running macOS >= 13.3
It is highly recommended to use macOS 14 (Sonoma)
MLX is also available on conda-forge. To install MLX with conda do:
.. code-block:: shell
conda install conda-forge::mlx
Troubleshooting
^^^^^^^^^^^^^^^
@@ -48,6 +56,9 @@ Build Requirements
- `cmake <https://cmake.org/>`_ -- version 3.24 or later, and ``make``
- Xcode >= 14.3 (Xcode >= 15.0 for macOS 14 and above)
.. note::
Ensure your shell environment is native ``arm``, not ``x86`` via Rosetta. If
the output of ``uname -p`` is ``x86``, see the :ref:`troubleshooting section <build shell>` below.
Python API
^^^^^^^^^^
@@ -169,6 +180,7 @@ should point to the path to the built metal library.
Troubleshooting
^^^^^^^^^^^^^^^
Metal not found
~~~~~~~~~~~~~~~
@@ -189,3 +201,34 @@ Then set the active developer directory:
.. code-block:: shell
sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
x86 Shell
~~~~~~~~~
.. _build shell:
If the output of ``uname -p`` is ``x86`` then your shell is running as x86 via
Rosetta instead of natively.
To fix this, find the application in Finder (``/Applications`` for iTerm,
``/Applications/Utilities`` for Terminal), right-click, and click “Get Info”.
Uncheck “Open using Rosetta”, close the “Get Info” window, and restart your
terminal.
Verify the terminal is now running natively with the following command:
.. code-block:: shell
$ uname -p
arm
Also check that cmake is using the correct architecture:
.. code-block:: shell
$ cmake --system-information | grep CMAKE_HOST_SYSTEM_PROCESSOR
CMAKE_HOST_SYSTEM_PROCESSOR "arm64"
If you see ``"x86_64"``, try re-installing ``cmake``. If you see ``"arm64"``
but the build errors out with "Building for x86_64 on macOS is not supported.",
wipe your build cache with ``rm -rf build/`` and try again.

View File

@@ -29,9 +29,9 @@ The default floating point type is ``float32`` and the default integer type is
* - ``uint32``
- 4
- 32-bit unsigned integer
* - ``uint32``
* - ``uint64``
- 8
- 32-bit unsigned integer
- 64-bit unsigned integer
* - ``int8``
- 1
- 8-bit signed integer

View File

@@ -0,0 +1,12 @@
.. _linalg:
Linear Algebra
==============
.. currentmodule:: mlx.core.linalg
.. autosummary::
:toctree: _autosummary
norm
qr

View File

@@ -123,7 +123,7 @@ To get more detailed information on the arrays in a :class:`Module` you can use
all the parameters in a :class:`Module` do:
.. code-block:: python
from mlx.utils import tree_map
shapes = tree_map(lambda p: p.shape, mlp.parameters())
@@ -131,7 +131,7 @@ As another example, you can count the number of parameters in a :class:`Module`
with:
.. code-block:: python
from mlx.utils import tree_flatten
num_params = sum(v.size for _, v in tree_flatten(mlp.parameters()))
@@ -170,14 +170,14 @@ In detail:
:meth:`mlx.core.value_and_grad`
.. autosummary::
:recursive:
:toctree: _autosummary
value_and_grad
Module
.. toctree::
nn/module
nn/layers
nn/functions
nn/losses
nn/init

View File

@@ -15,9 +15,10 @@ simple functions.
gelu
gelu_approx
gelu_fast_approx
relu
mish
prelu
relu
selu
softshrink
silu
step
selu
mish

View File

@@ -0,0 +1,45 @@
.. _init:
.. currentmodule:: mlx.nn.init
Initializers
------------
The ``mlx.nn.init`` package contains commonly used initializers for neural
network parameters. Initializers return a function which can be applied to any
input :obj:`mlx.core.array` to produce an initialized output.
For example:
.. code:: python
import mlx.core as mx
import mlx.nn as nn
init_fn = nn.init.uniform()
# Produces a [2, 2] uniform matrix
param = init_fn(mx.zeros((2, 2)))
To re-initialize all the parameters in an :obj:`mlx.nn.Module` from, say, a uniform
distribution, you can do:
.. code:: python
import mlx.nn as nn
model = nn.Sequential(nn.Linear(5, 10), nn.ReLU(), nn.Linear(10, 5))
init_fn = nn.init.uniform(low=-0.1, high=0.1)
model.apply(init_fn)
.. autosummary::
:toctree: _autosummary
constant
normal
uniform
identity
glorot_normal
glorot_uniform
he_normal
he_uniform

View File

@@ -9,21 +9,30 @@ Layers
:toctree: _autosummary
:template: nn-module-template.rst
Embedding
ReLU
PReLU
GELU
SiLU
Step
SELU
Mish
Linear
ALiBi
BatchNorm
Conv1d
Conv2d
LayerNorm
RMSNorm
Dropout
Dropout2d
Dropout3d
Embedding
GELU
GroupNorm
RoPE
InstanceNorm
LayerNorm
Linear
Mish
MultiHeadAttention
Sequential
PReLU
QuantizedLinear
RMSNorm
ReLU
RoPE
SELU
Sequential
SiLU
SinusoidalPositionalEncoding
Softshrink
Step
Transformer

View File

@@ -10,9 +10,15 @@ Loss Functions
:template: nn-module-template.rst
binary_cross_entropy
cosine_similarity_loss
cross_entropy
gaussian_nll_loss
hinge_loss
huber_loss
kl_div_loss
l1_loss
log_cosh_loss
margin_ranking_loss
mse_loss
nll_loss
smooth_l1_loss

View File

@@ -0,0 +1,37 @@
Module
======
.. currentmodule:: mlx.nn
.. autoclass:: Module
.. rubric:: Attributes
.. autosummary::
:toctree: _autosummary
Module.training
Module.state
.. rubric:: Methods
.. autosummary::
:toctree: _autosummary
Module.apply
Module.apply_to_modules
Module.children
Module.eval
Module.filter_and_map
Module.freeze
Module.leaf_modules
Module.load_weights
Module.modules
Module.named_modules
Module.parameters
Module.save_weights
Module.train
Module.trainable_parameters
Module.unfreeze
Module.update
Module.update_modules

View File

@@ -35,7 +35,10 @@ Operations
cos
cosh
dequantize
diag
diagonal
divide
divmod
equal
erf
erfinv
@@ -49,6 +52,11 @@ Operations
greater
greater_equal
identity
inner
isnan
isposinf
isneginf
isinf
less
less_equal
linspace
@@ -59,6 +67,8 @@ Operations
log1p
logaddexp
logical_not
logical_and
logical_or
logsumexp
matmul
max
@@ -71,18 +81,22 @@ Operations
negative
ones
ones_like
outer
partition
pad
prod
quantize
quantized_matmul
reciprocal
repeat
reshape
round
rsqrt
save
savez
savez_compressed
save_gguf
save_safetensors
sigmoid
sign
sin
@@ -102,6 +116,7 @@ Operations
take_along_axis
tan
tanh
tensordot
transpose
tri
tril

View File

@@ -0,0 +1,23 @@
Optimizer
=========
.. currentmodule:: mlx.optimizers
.. autoclass:: Optimizer
.. rubric:: Attributes
.. autosummary::
:toctree: _autosummary
Optimizer.state
.. rubric:: Methods
.. autosummary::
:toctree: _autosummary
Optimizer.apply_gradients
Optimizer.init
Optimizer.update

View File

@@ -29,17 +29,20 @@ model's parameters and the **optimizer state**.
# Compute the new parameters but also the optimizer state.
mx.eval(model.parameters(), optimizer.state)
.. toctree::
optimizer
.. currentmodule:: mlx.optimizers
.. autosummary::
:toctree: _autosummary
:template: optimizers-template.rst
OptimizerState
Optimizer
SGD
RMSprop
Adagrad
Adafactor
AdaDelta
Adam
AdamW

View File

@@ -33,13 +33,13 @@ we use a splittable version of Threefry, which is a counter-based PRNG.
.. autosummary::
:toctree: _autosummary
seed
key
split
bernoulli
categorical
gumbel
key
normal
randint
uniform
seed
split
truncated_normal
uniform

View File

@@ -9,9 +9,11 @@ Transforms
:toctree: _autosummary
eval
compile
disable_compile
enable_compile
grad
value_and_grad
jvp
vjp
vmap
simplify

docs/src/usage/compile.rst
View File

@@ -0,0 +1,430 @@
.. _compile:
Compilation
===========
.. currentmodule:: mlx.core
MLX has a :func:`compile` function transformation which compiles computation
graphs. Function compilation results in smaller graphs by merging common work
and fusing certain operations. In many cases this can lead to big improvements
in run-time and memory use.
Getting started with :func:`compile` is simple, but there are some edge cases
that are good to be aware of for more complex graphs and advanced usage.
Basics of Compile
-----------------
Let's start with a simple example:
.. code-block:: python
def fun(x, y):
return mx.exp(-x) + y
x = mx.array(1.0)
y = mx.array(2.0)
# Regular call, no compilation
# Prints: array(2.36788, dtype=float32)
print(fun(x, y))
# Compile the function
compiled_fun = mx.compile(fun)
# Prints: array(2.36788, dtype=float32)
print(compiled_fun(x, y))
The output of both the regular function and the compiled function is the same
up to numerical precision.
The first time you call a compiled function, MLX will build the compute
graph, optimize it, and generate and compile code. This can be relatively
slow. However, MLX will cache compiled functions, so calling a compiled
function multiple times will not initiate a new compilation. This means you
should typically compile functions that you plan to use more than once.
.. code-block:: python
def fun(x, y):
return mx.exp(-x) + y
x = mx.array(1.0)
y = mx.array(2.0)
compiled_fun = mx.compile(fun)
# Compiled here
compiled_fun(x, y)
# Not compiled again
compiled_fun(x, y)
# Not compiled again
mx.compile(fun)(x, y)
There are some important cases to be aware of that can cause a function to
be recompiled:
* Changing the shape or number of dimensions
* Changing the type of any of the inputs
* Changing the number of inputs to the function
In certain cases only some of the compilation stack will be rerun (for
example when changing the shapes) and in other cases the full compilation
stack will be rerun (for example when changing the types). In general you
should avoid compiling functions too frequently.
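For instance, a minimal sketch of these cases (the exact caching behavior is an implementation detail and may differ):
.. code-block:: python

    def fun(x, y):
        return mx.exp(-x) + y

    compiled_fun = mx.compile(fun)

    # Traced and compiled on the first call
    compiled_fun(mx.ones((2,)), mx.ones((2,)))

    # Same shapes and dtypes: the cached compilation is reused
    compiled_fun(mx.zeros((2,)), mx.zeros((2,)))

    # New shape: part of the compilation stack is rerun
    compiled_fun(mx.ones((3,)), mx.ones((3,)))

    # New dtype: the full compilation stack is rerun
    compiled_fun(mx.ones((2,), dtype=mx.float16), mx.ones((2,), dtype=mx.float16))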
Another idiom to watch out for is compiling functions which get created and
destroyed frequently. This can happen, for example, when compiling an anonymous
function in a loop:
.. code-block:: python
a = mx.array(1.0)
# Don't do this, compiles lambda at each iteration
for _ in range(5):
mx.compile(lambda x: mx.exp(mx.abs(x)))(a)
Example Speedup
---------------
The :func:`mlx.nn.gelu` is a nonlinear activation function commonly used with
Transformer-based models. The implementation involves several unary and binary
element-wise operations:
.. code-block:: python
def gelu(x):
return x * (1 + mx.erf(x / math.sqrt(2))) / 2
If you use this function with small arrays, it will be overhead bound. If you
use it with large arrays it will be memory bandwidth bound. However, all of
the operations in the ``gelu`` are fusible into a single kernel with
:func:`compile`. This can speedup both cases considerably.
Let's compare the runtime of the regular function versus the compiled
function. We'll use the following timing helper which does a warm up and
handles synchronization:
.. code-block:: python
import time
def timeit(fun, x):
# warm up
for _ in range(10):
mx.eval(fun(x))
tic = time.perf_counter()
for _ in range(100):
mx.eval(fun(x))
toc = time.perf_counter()
tpi = 1e3 * (toc - tic) / 100
print(f"Time per iteration {tpi:.3f} (ms)")
Now make an array, and benchmark both functions:
.. code-block:: python
x = mx.random.uniform(shape=(32, 1000, 4096))
timeit(nn.gelu, x)
timeit(mx.compile(nn.gelu), x)
On an M1 Max the times are 15.5 and 3.1 milliseconds. The compiled ``gelu`` is
five times faster.
.. note::
As of the latest MLX, CPU functions are not fully compiled. Compiling CPU
functions can still be helpful, but won't typically result in as large a
speedup as compiling operations that run on the GPU.
Debugging
---------
When a compiled function is first called, it is traced with placeholder
inputs. This means you can't evaluate arrays (for example to print their
contents) inside compiled functions.
.. code-block:: python
@mx.compile
def fun(x):
z = -x
print(z) # Crash
return mx.exp(z)
fun(mx.array(5.0))
For debugging, inspecting arrays can be helpful. One way to do that is to
globally disable compilation using the :func:`disable_compile` function or
``MLX_DISABLE_COMPILE`` flag. For example the following is okay even though
``fun`` is compiled:
.. code-block:: python
@mx.compile
def fun(x):
z = -x
print(z) # Okay
return mx.exp(z)
mx.disable_compile()
fun(mx.array(5.0))
Pure Functions
--------------
Compiled functions are intended to be *pure*; that is they should not have side
effects. For example:
.. code-block:: python
state = []
@mx.compile
def fun(x, y):
z = x + y
state.append(z)
return mx.exp(z)
fun(mx.array(1.0), mx.array(2.0))
# Crash!
print(state)
After the first call of ``fun``, the ``state`` list will hold a placeholder
array. The placeholder does not have any data; it is only used to build the
computation graph. Printing such an array results in a crash.
You have two options to deal with this. The first option is to simply return
``state`` as an output:
.. code-block:: python
state = []
@mx.compile
def fun(x, y):
z = x + y
state.append(z)
return mx.exp(z), state
_, state = fun(mx.array(1.0), mx.array(2.0))
# Prints [array(3, dtype=float32)]
print(state)
In some cases returning updated state can be pretty inconvenient. Hence,
:func:`compile` has a parameter to capture implicit outputs:
.. code-block:: python
from functools import partial
state = []
# Tell compile to capture state as an output
@partial(mx.compile, outputs=state)
def fun(x, y):
z = x + y
state.append(z)
return mx.exp(z), state
fun(mx.array(1.0), mx.array(2.0))
# Prints [array(3, dtype=float32)]
print(state)
This is particularly useful for compiling a function which includes an update
to a container of arrays, as is commonly done when training the parameters of a
:class:`mlx.nn.Module`.
Compiled functions will also treat any inputs not in the parameter list as
constants. For example:
.. code-block:: python
state = [mx.array(1.0)]
@mx.compile
def fun(x):
return x + state[0]
# Prints array(2, dtype=float32)
print(fun(mx.array(1.0)))
# Update state
state[0] = mx.array(5.0)
# Still prints array(2, dtype=float32)
print(fun(mx.array(1.0)))
In order to have the change of state reflected in the outputs of ``fun`` you
again have two options. The first option is to simply pass ``state`` as input
to the function. In some cases this can be pretty inconvenient. Hence,
:func:`compile` also has a parameter to capture implicit inputs:
.. code-block:: python
from functools import partial
state = [mx.array(1.0)]
# Tell compile to capture state as an input
@partial(mx.compile, inputs=state)
def fun(x):
return x + state[0]
# Prints array(2, dtype=float32)
print(fun(mx.array(1.0)))
# Update state
state[0] = mx.array(5.0)
# Prints array(6, dtype=float32)
print(fun(mx.array(1.0)))
Compiling Training Graphs
-------------------------
This section will step through how to use :func:`compile` with a simple example
of a common setup: training a model with :obj:`mlx.nn.Module` using an
:obj:`mlx.optimizers.Optimizer` with state. We will show how to compile the
full forward, backward, and update with :func:`compile`.
To start, here is the simple example without any compilation:
.. code-block:: python
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
# 4 examples with 10 features each
x = mx.random.uniform(shape=(4, 10))
# 0, 1 targets
y = mx.array([0, 1, 0, 1])
# Simple linear model
model = nn.Linear(10, 1)
# SGD with momentum
optimizer = optim.SGD(learning_rate=0.1, momentum=0.8)
def loss_fn(model, x, y):
logits = model(x).squeeze()
return nn.losses.binary_cross_entropy(logits, y)
loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
# Perform 10 steps of gradient descent
for it in range(10):
loss, grads = loss_and_grad_fn(model, x, y)
optimizer.update(model, grads)
mx.eval(model.parameters(), optimizer.state)
To compile the update we can put it all in a function and compile it with the
appropriate input and output captures. Here's the same example but compiled:
.. code-block:: python
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
from functools import partial
# 4 examples with 10 features each
x = mx.random.uniform(shape=(4, 10))
# 0, 1 targets
y = mx.array([0, 1, 0, 1])
# Simple linear model
model = nn.Linear(10, 1)
# SGD with momentum
optimizer = optim.SGD(learning_rate=0.1, momentum=0.8)
def loss_fn(model, x, y):
logits = model(x).squeeze()
return nn.losses.binary_cross_entropy(logits, y)
# The state that will be captured as input and output
state = [model.state, optimizer.state]
@partial(mx.compile, inputs=state, outputs=state)
def step(x, y):
loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
loss, grads = loss_and_grad_fn(model, x, y)
optimizer.update(model, grads)
return loss
# Perform 10 steps of gradient descent
for it in range(10):
loss = step(x, y)
# Evaluate the model and optimizer state
mx.eval(state)
print(loss)
.. note::
If you are using a module which performs random sampling such as
:func:`mlx.nn.Dropout`, make sure you also include ``mx.random.state`` in the
``state`` captured by :func:`compile`, i.e. ``state = [model.state,
optimizer.state, mx.random.state]``.
.. note::
For more examples of compiling full training graphs, check out the `MLX
Examples <https://github.com/ml-explore/mlx-examples>`_ GitHub repo.
Transformations with Compile
----------------------------
In MLX function transformations are composable. You can apply any function
transformation to the output of any other function transformation. For more on
this, see the documentation on :ref:`function transforms
<function_transforms>`.
Compiling transformed functions works just as expected:
.. code-block:: python
grad_fn = mx.grad(mx.exp)
compiled_grad_fn = mx.compile(grad_fn)
# Prints: array(2.71828, dtype=float32)
print(grad_fn(mx.array(1.0)))
# Also prints: array(2.71828, dtype=float32)
print(compiled_grad_fn(mx.array(1.0)))
.. note::
In order to compile as much as possible, a transformation of a compiled
function will not by default be compiled. To compile the transformed
function simply pass it through :func:`compile`.
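As a brief sketch of what this note describes (using :func:`exp` purely for illustration):
.. code-block:: python

    compiled_exp = mx.compile(mx.exp)

    # The gradient of a compiled function is not itself compiled by default
    grad_fn = mx.grad(compiled_exp)

    # Pass the transformed function through compile to compile it as well
    compiled_grad_fn = mx.compile(grad_fn)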
You can also compile functions which themselves call compiled functions. A
good practice is to compile the outermost function to give :func:`compile`
the most opportunity to optimize the computation graph:
.. code-block:: python
@mx.compile
def inner(x):
return mx.exp(-mx.abs(x))
def outer(x):
return inner(inner(x))
# Compiling the outer function is good to do as it will likely
# be faster even though the inner functions are compiled
fun = mx.compile(outer)

View File

@@ -0,0 +1,191 @@
.. _function_transforms:
Function Transforms
===================
.. currentmodule:: mlx.core
MLX uses composable function transformations for automatic differentiation,
vectorization, and compute graph optimizations. To see the complete list of
function transformations, check out the :ref:`API documentation <transforms>`.
The key idea behind composable function transformations is that every
transformation returns a function which can be further transformed.
Here is a simple example:
.. code-block:: shell
>>> dfdx = mx.grad(mx.sin)
>>> dfdx(mx.array(mx.pi))
array(-1, dtype=float32)
>>> mx.cos(mx.array(mx.pi))
array(-1, dtype=float32)
The output of :func:`grad` on :func:`sin` is simply another function. In this
case it is the gradient of the sine function which is exactly the cosine
function. To get the second derivative you can do:
.. code-block:: shell
>>> d2fdx2 = mx.grad(mx.grad(mx.sin))
>>> d2fdx2(mx.array(mx.pi / 2))
array(-1, dtype=float32)
>>> mx.sin(mx.array(mx.pi / 2))
array(1, dtype=float32)
Using :func:`grad` on the output of :func:`grad` is always okay. You keep
getting higher-order derivatives.
Any of the MLX function transformations can be composed in any order to any
depth. See the following sections for more information on :ref:`automatic
differentiation <auto diff>` and :ref:`automatic vectorization <vmap>`.
For more information on :func:`compile` see the :ref:`compile documentation <compile>`.
Automatic Differentiation
-------------------------
.. _auto diff:
Automatic differentiation in MLX works on functions rather than on implicit
graphs.
.. note::
If you are coming to MLX from PyTorch, you no longer need functions like
``backward``, ``zero_grad``, and ``detach``, or properties like
``requires_grad``.
The most basic example is taking the gradient of a scalar-valued function as we
saw above. You can use the :func:`grad` and :func:`value_and_grad` function to
compute gradients of more complex functions. By default these functions compute
the gradient with respect to the first argument:
.. code-block:: python
def loss_fn(w, x, y):
return mx.mean(mx.square(w * x - y))
w = mx.array(1.0)
x = mx.array([0.5, -0.5])
y = mx.array([1.5, -1.5])
# Computes the gradient of loss_fn with respect to w:
grad_fn = mx.grad(loss_fn)
dloss_dw = grad_fn(w, x, y)
# Prints array(-1, dtype=float32)
print(dloss_dw)
# To get the gradient with respect to x we can do:
grad_fn = mx.grad(loss_fn, argnums=1)
dloss_dx = grad_fn(w, x, y)
# Prints array([-1, 1], dtype=float32)
print(dloss_dx)
One way to get the loss and gradient is to call ``loss_fn`` followed by
``grad_fn``, but this can result in a lot of redundant work. Instead, you
should use :func:`value_and_grad`. Continuing the above example:
.. code-block:: python
# Computes the gradient of loss_fn with respect to w:
loss_and_grad_fn = mx.value_and_grad(loss_fn)
loss, dloss_dw = loss_and_grad_fn(w, x, y)
# Prints array(1, dtype=float32)
print(loss)
# Prints array(-1, dtype=float32)
print(dloss_dw)
You can also take the gradient with respect to arbitrarily nested Python
containers of arrays (specifically any of :obj:`list`, :obj:`tuple`, or
:obj:`dict`).
Suppose we wanted a weight and a bias parameter in the above example. A nice
way to do that is the following:
.. code-block:: python
def loss_fn(params, x, y):
w, b = params["weight"], params["bias"]
h = w * x + b
return mx.mean(mx.square(h - y))
params = {"weight": mx.array(1.0), "bias": mx.array(0.0)}
x = mx.array([0.5, -0.5])
y = mx.array([1.5, -1.5])
# Computes the gradient of loss_fn with respect to both the
# weight and bias:
grad_fn = mx.grad(loss_fn)
grads = grad_fn(params, x, y)
# Prints
# {'weight': array(-1, dtype=float32), 'bias': array(0, dtype=float32)}
print(grads)
Notice the tree structure of the parameters is preserved in the gradients.
In some cases you may want to stop gradients from propagating through a
part of the function. You can use the :func:`stop_gradient` for that.
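As a small illustrative sketch (the function below is made up):
.. code-block:: python

    def fun(x):
        # The second term is treated as a constant, so no gradient
        # flows through it
        return mx.square(x) + mx.stop_gradient(mx.square(x))

    # Only the first term contributes to the gradient:
    # prints array(6, dtype=float32)
    print(mx.grad(fun)(mx.array(3.0)))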
Automatic Vectorization
-----------------------
.. _vmap:
Use :func:`vmap` to automate vectorizing complex functions. Here we'll go
through a basic and contrived example for the sake of clarity, but :func:`vmap`
can be quite powerful for more complex functions which are difficult to optimize
by hand.
.. warning::
Some operations are not yet supported with :func:`vmap`. If you encounter an error
like: ``ValueError: Primitive's vmap not implemented.`` file an `issue
<https://github.com/ml-explore/mlx/issues>`_ and include your function.
We will prioritize including it.
A naive way to add the elements from two sets of vectors is with a loop:
.. code-block:: python
xs = mx.random.uniform(shape=(4096, 100))
ys = mx.random.uniform(shape=(100, 4096))
def naive_add(xs, ys):
return [xs[i] + ys[:, i] for i in range(xs.shape[1])]
Instead you can use :func:`vmap` to automatically vectorize the addition:
.. code-block:: python
# Vectorize over the second dimension of x and the
# first dimension of y
vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(1, 0))
The ``in_axes`` parameter can be used to specify which dimensions of the
corresponding input to vectorize over. Similarly, use ``out_axes`` to specify
where the vectorized axes should be in the outputs.
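For example, a quick sketch of ``out_axes`` with the same ``xs`` and ``ys`` as above (assuming the default places the vectorized axis first in the output):
.. code-block:: python

    # Place the vectorized axis second in the output instead of first
    vmap_add_t = mx.vmap(lambda x, y: x + y, in_axes=(1, 0), out_axes=1)

    # Shape is [4096, 100] rather than the default [100, 4096]
    print(vmap_add_t(xs, ys).shape)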
Let's time these two different versions:
.. code-block:: python
import timeit
print(timeit.timeit(lambda: mx.eval(naive_add(xs, ys)), number=100))
print(timeit.timeit(lambda: mx.eval(vmap_add(xs, ys)), number=100))
On an M1 Max the naive version takes in total ``0.390`` seconds whereas the
vectorized version takes only ``0.025`` seconds, more than ten times faster.
Of course, this operation is quite contrived. A better approach is to simply do
``xs + ys.T``, but for more complex functions :func:`vmap` can be quite handy.

docs/src/usage/indexing.rst
View File

@@ -0,0 +1,123 @@
.. _indexing:
Indexing Arrays
===============
.. currentmodule:: mlx.core
For the most part, indexing an MLX :obj:`array` works the same as indexing a
NumPy :obj:`numpy.ndarray`. See the `NumPy documentation
<https://numpy.org/doc/stable/user/basics.indexing.html>`_ for more details on
how that works.
For example, you can use regular integers and slices (:obj:`slice`) to index arrays:
.. code-block:: shell
>>> arr = mx.arange(10)
>>> arr[3]
array(3, dtype=int32)
>>> arr[-2] # negative indexing works
array(8, dtype=int32)
>>> arr[2:8:2] # start, stop, stride
array([2, 4, 6], dtype=int32)
For multi-dimensional arrays, the ``...`` or :obj:`Ellipsis` syntax works as in NumPy:
.. code-block:: shell
>>> arr = mx.arange(8).reshape(2, 2, 2)
>>> arr[:, :, 0]
array([[0, 2],
       [4, 6]], dtype=int32)
>>> arr[..., 0]
array([[0, 2],
       [4, 6]], dtype=int32)
You can index with ``None`` to create a new axis:
.. code-block:: shell
>>> arr = mx.arange(8)
>>> arr.shape
[8]
>>> arr[None].shape
[1, 8]
You can also use an :obj:`array` to index another :obj:`array`:
.. code-block:: shell
>>> arr = mx.arange(10)
>>> idx = mx.array([5, 7])
>>> arr[idx]
array([5, 7], dtype=int32)
Mixing and matching integers, :obj:`slice`, ``...``, and :obj:`array` indices
works just as in NumPy.
Other functions which may be useful for indexing arrays are :func:`take` and
:func:`take_along_axis`.
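As a brief sketch of both (the small arrays here are made up for illustration):
.. code-block:: python

    arr = mx.array([[1, 2], [3, 4]])

    # Take from the flattened array: picks the elements 1 and 4
    mx.take(arr, mx.array([0, 3]))

    # Gather along axis 1 with per-row indices: picks 2 from the
    # first row and 3 from the second
    mx.take_along_axis(arr, mx.array([[1], [0]]), axis=1)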
Differences from NumPy
----------------------
.. Note::
MLX indexing is different from NumPy indexing in two important ways:
* Indexing does not perform bounds checking. Indexing out of bounds is
undefined behavior.
* Boolean mask based indexing is not yet supported.
The reason for the lack of bounds checking is that exceptions cannot propagate
from the GPU. Performing bounds checking for array indices before launching the
kernel would be extremely inefficient.
Indexing with boolean masks is something that MLX may support in the future. In
general, MLX has limited support for operations whose output *shapes* depend on
the input *data*. Other examples of such operations which MLX does not yet
support include :func:`numpy.nonzero` and the single-input version of
:func:`numpy.where`.
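Until boolean mask indexing is supported, fixed-shape alternatives usually
suffice. A minimal sketch using :func:`where` (the variable names are only
illustrative):
.. code-block:: python

   a = mx.array([1.0, -2.0, 3.0, -4.0])
   mask = a > 0

   # a[mask] is not supported, but fixed-shape alternatives are:
   kept = mx.where(mask, a, mx.zeros_like(a))  # keep positives, zero out the rest
   total = (a * mask).sum()                    # masked reduction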
In Place Updates
----------------
In place updates to indexed arrays are possible in MLX. For example:
.. code-block:: shell
>>> a = mx.array([1, 2, 3])
>>> a[2] = 0
>>> a
array([1, 2, 0], dtype=int32)
Just as in NumPy, in place updates will be reflected in all references to the
same array:
.. code-block:: shell
>>> a = mx.array([1, 2, 3])
>>> b = a
>>> b[2] = 0
>>> b
array([1, 2, 0], dtype=int32)
>>> a
array([1, 2, 0], dtype=int32)
Transformations of functions which use in-place updates are allowed and work as
expected. For example:
.. code-block:: python
def fun(x, idx):
    x[idx] = 2.0
    return x.sum()
dfdx = mx.grad(fun)(mx.array([1.0, 2.0, 3.0]), mx.array([1]))
print(dfdx) # Prints: array([1, 0, 1], dtype=float32)
In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx``
and ones elsewhere.

View File

@@ -0,0 +1,144 @@
.. _lazy eval:
Lazy Evaluation
===============
.. currentmodule:: mlx.core
Why Lazy Evaluation
-------------------
When you perform operations in MLX, no computation actually happens. Instead, a
compute graph is recorded. The actual computation only happens when an
:func:`eval` is performed.
MLX uses lazy evaluation because it has some nice features, some of which we
describe below.
Transforming Compute Graphs
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Lazy evaluation lets us record a compute graph without actually doing any
computations. This is useful for function transformations like :func:`grad` and
:func:`vmap` as well as for graph optimizations.
Currently, MLX does not compile and rerun compute graphs. They are all
generated dynamically. However, lazy evaluation makes it much easier to
integrate compilation for future performance enhancements.
Only Compute What You Use
^^^^^^^^^^^^^^^^^^^^^^^^^
In MLX you do not need to worry as much about computing outputs that are never
used. For example:
.. code-block:: python
def fun(x):
    a = fun1(x)
    b = expensive_fun(a)
    return a, b
y, _ = fun(x)
Here, we never actually compute the output of ``expensive_fun``. Use this
pattern with care though, as the graph of ``expensive_fun`` is still built, and
that has some cost associated with it.
Similarly, lazy evaluation can be beneficial for saving memory while keeping
code simple. Say you have a very large model ``Model`` derived from
:obj:`mlx.nn.Module`. You can instantiate this model with ``model = Model()``.
Typically, this will initialize all of the weights as ``float32``, but the
initialization does not actually compute anything until you perform an
:func:`eval`. If you update the model with ``float16`` weights, your maximum
consumed memory will be about half of what it would be if eager computation
were used instead.
This pattern is simple to do in MLX thanks to lazy computation:
.. code-block:: python
model = Model() # no memory used yet
model.load_weights("weights_fp16.safetensors")
When to Evaluate
----------------
A common question is when to use :func:`eval`. The trade-off is between
letting graphs get too large and not batching enough useful work.
For example:
.. code-block:: python
for _ in range(100):
    a = a + b
    mx.eval(a)
    b = b * 2
    mx.eval(b)
This is a bad idea because there is some fixed overhead with each graph
evaluation. On the other hand, there is some slight overhead which grows with
the compute graph size, so extremely large graphs (while computationally
correct) can be costly.
Luckily, a wide range of compute graph sizes work pretty well with MLX:
anything from a few tens of operations to many thousands of operations per
evaluation should be okay.
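For instance, instead of evaluating after every operation as in the loop above,
a sketch that batches work into fewer, larger graphs (the cadence of ten
iterations is arbitrary):
.. code-block:: python

   for i in range(100):
       a = a + b
       b = b * 2
       if (i + 1) % 10 == 0:  # evaluate every ten iterations
           mx.eval(a, b)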
Most numerical computations have an iterative outer loop (e.g. the iteration in
stochastic gradient descent). A natural and usually efficient place to use
:func:`eval` is at each iteration of this outer loop.
Here is a concrete example:
.. code-block:: python
for batch in dataset:
    # Nothing has been evaluated yet
    loss, grad = value_and_grad_fn(model, batch)
    # Still nothing has been evaluated
    optimizer.update(model, grad)
    # Evaluate the loss and the new parameters which will
    # run the full gradient computation and optimizer update
    mx.eval(loss, model.parameters())
An important behavior to be aware of is when the graph will be implicitly
evaluated. Anytime you ``print`` an array, convert it to a
:obj:`numpy.ndarray`, or otherwise access its memory via :obj:`memoryview`,
the graph will be evaluated. Saving arrays via :func:`save` (or any other MLX
saving functions) will also evaluate the array.
Calling :func:`array.item` on a scalar array will also evaluate it. In the
example above, printing the loss (``print(loss)``) or adding the loss scalar to
a list (``losses.append(loss.item())``) would cause a graph evaluation. If
these lines are before ``mx.eval(loss, model.parameters())`` then this
will be a partial evaluation, computing only the forward pass.
Also, calling :func:`eval` on an array or set of arrays multiple times is
perfectly fine. This is effectively a no-op.
.. warning::
Using scalar arrays for control-flow will cause an evaluation.
Here is an example:
.. code-block:: python
def fun(x):
    h, y = first_layer(x)
    if y > 0:  # An evaluation is done here!
        z = second_layer_a(h)
    else:
        z = second_layer_b(h)
    return z
Using arrays for control flow should be done with care. The above example works
and can even be used with gradient transformations. However, this can be very
inefficient if evaluations are done too frequently.
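If the branch itself can be expressed with array operations, one way to avoid
the evaluation entirely is to compute both branches and select the result with
:func:`where`. A sketch reusing the (hypothetical) layer functions from the
example above:
.. code-block:: python

   def fun(x):
       h, y = first_layer(x)
       # Both branches stay in the lazy graph; no evaluation is forced
       # by Python control flow. This trades extra compute for laziness.
       return mx.where(y > 0, second_layer_a(h), second_layer_b(h))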

108
docs/src/usage/numpy.rst Normal file
View File

@@ -0,0 +1,108 @@
.. _numpy:
Conversion to NumPy and Other Frameworks
========================================
MLX arrays implement the `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_.
Let's convert an array to NumPy and back.
.. code-block:: python
import mlx.core as mx
import numpy as np
a = mx.arange(3)
b = np.array(a) # copy of a
c = mx.array(b) # copy of b
.. note::
Since NumPy does not support ``bfloat16`` arrays, you will need to convert to ``float16`` or ``float32`` first:
``np.array(a.astype(mx.float32))``.
Otherwise, you will receive an error like: ``Item size 2 for PEP 3118 buffer format string does not match the dtype V item size 0.``
By default, NumPy copies data to a new array. This can be prevented by creating an array view:
.. code-block:: python
a = mx.arange(3)
a_view = np.array(a, copy=False)
print(a_view.flags.owndata) # False
a_view[0] = 1
print(a[0].item()) # 1
A NumPy array view is a normal NumPy array, except that it does not own its memory.
This means writing to the view is reflected in the original array.
While this is quite useful for avoiding copies, note that external changes to an array's memory will not be reflected in gradients.
Let's demonstrate this in an example:
.. code-block:: python
def f(x):
    x_view = np.array(x, copy=False)
    x_view[:] *= x_view  # modify memory without telling mx
    return x.sum()
x = mx.array([3.0])
y, df = mx.value_and_grad(f)(x)
print("f(x) = x² =", y.item()) # 9.0
print("f'(x) = 2x !=", df.item()) # 1.0
The function ``f`` indirectly modifies the array ``x`` through a memory view.
However, this modification is not reflected in the gradient, as seen in the last line outputting ``1.0``,
representing the gradient of the sum operation alone.
The squaring of ``x`` occurs externally to MLX, meaning that no gradient is incorporated.
It's important to note that a similar issue arises during array conversion and copying.
For instance, a function defined as ``mx.array(np.array(x)**2).sum()`` would also result in an incorrect gradient,
even though no in-place operations on MLX memory are executed.
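For comparison, a sketch that keeps the squaring inside MLX (no NumPy round
trip) produces the expected gradient:
.. code-block:: python

   def g(x):
       return (x * x).sum()

   x = mx.array([3.0])
   y, dg = mx.value_and_grad(g)(x)
   print("g(x) =", y.item())    # 9.0
   print("g'(x) =", dg.item())  # 6.0, i.e. 2x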
PyTorch
-------
.. warning::
PyTorch support for :obj:`memoryview` is experimental and can break for
multi-dimensional arrays. Casting to NumPy first is advised for now.
PyTorch supports the buffer protocol, but it requires an explicit :obj:`memoryview`.
.. code-block:: python
import mlx.core as mx
import torch
a = mx.arange(3)
b = torch.tensor(memoryview(a))
c = mx.array(b.numpy())
Conversion from PyTorch tensors back to MLX arrays must be done via intermediate NumPy arrays with ``numpy()``.
JAX
---
JAX fully supports the buffer protocol.
.. code-block:: python
import mlx.core as mx
import jax.numpy as jnp
a = mx.arange(3)
b = jnp.array(a)
c = mx.array(b)
TensorFlow
----------
TensorFlow supports the buffer protocol, but it requires an explicit :obj:`memoryview`.
.. code-block:: python
import mlx.core as mx
import tensorflow as tf
a = mx.arange(3)
b = tf.constant(memoryview(a))
c = mx.array(b)

View File

@@ -40,6 +40,9 @@ automatically evaluate the array.
>> np.array(c) # Also evaluates c
array([2., 4., 6., 8.], dtype=float32)
See the page on :ref:`Lazy Evaluation <lazy eval>` for more details.
Function and Graph Transformations
----------------------------------

View File

@@ -0,0 +1,81 @@
.. _saving_and_loading:
Saving and Loading Arrays
=========================
.. currentmodule:: mlx.core
MLX supports multiple array serialization formats.
.. list-table:: Serialization Formats
:widths: 20 8 25 25
:header-rows: 1
* - Format
- Extension
- Function
- Notes
* - NumPy
- ``.npy``
- :func:`save`
- Single arrays only
* - NumPy archive
- ``.npz``
- :func:`savez` and :func:`savez_compressed`
- Multiple arrays
* - Safetensors
- ``.safetensors``
- :func:`save_safetensors`
- Multiple arrays
* - GGUF
- ``.gguf``
- :func:`save_gguf`
- Multiple arrays
The :func:`load` function will load any of the supported serialization
formats. It determines the format from the file extension. The output of
:func:`load` depends on the format.
Here's an example of saving a single array to a file:
.. code-block:: shell
>>> a = mx.array([1.0])
>>> mx.save("array", a)
The array ``a`` will be saved in the file ``array.npy``. Including the
extension when saving is optional; if it is missing, it will be added
automatically. You can load the array with:
.. code-block:: shell
>>> mx.load("array.npy", a)
array([1], dtype=float32)
Here's an example of saving several arrays to a single file:
.. code-block:: shell
>>> a = mx.array([1.0])
>>> b = mx.array([2.0])
>>> mx.savez("arrays", a, b=b)
For compatibility with :func:`numpy.savez`, the MLX :func:`savez` takes arrays
as positional or keyword arguments. Arrays passed positionally get default
names (``arr_0``, ``arr_1``, ...). The file can be loaded with:
.. code-block:: shell
>>> mx.load("arrays.npz")
{'b': array([2], dtype=float32), 'arr_0': array([1], dtype=float32)}
In this case :func:`load` returns a dictionary of names to arrays.
The functions :func:`save_safetensors` and :func:`save_gguf` are similar to
:func:`savez`, but they take as input a :obj:`dict` of string names to arrays:
.. code-block:: shell
>>> a = mx.array([1.0])
>>> b = mx.array([2.0])
>>> mx.save_safetensors("arrays", {"a": a, "b": b})
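Loading the file returns a :obj:`dict` of names to arrays, just as with
``.npz`` files. A small sketch of the round trip, assuming the
``arrays.safetensors`` file written above:
.. code-block:: shell

   >>> mx.load("arrays.safetensors")
   {'a': array([1], dtype=float32), 'b': array([2], dtype=float32)}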

View File

@@ -57,7 +57,7 @@ void array_basics() {
assert(z.shape(0) == 2);
assert(z.shape(1) == 2);
// To actually run the compuation you must evaluate `z`.
// To actually run the computation you must evaluate `z`.
// Under the hood, mlx records operations in a graph.
// The variable `z` is a node in the graph which points to its operation
// and inputs. When `eval` is called on an array (or arrays), the array and

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.24)
cmake_minimum_required(VERSION 3.27)
project(mlx_sample_extensions LANGUAGES CXX)
@@ -63,4 +63,4 @@ target_link_libraries(mlx_sample_extensions PRIVATE mlx_ext)
if(BUILD_SHARED_LIBS)
target_link_options(mlx_sample_extensions PRIVATE -Wl,-rpath,@loader_path)
endif()
endif()

View File

@@ -26,7 +26,7 @@ namespace mlx::core {
///////////////////////////////////////////////////////////////////////////////
/**
* Scale and sum two vectors elementwise
* Scale and sum two vectors element-wise
* z = alpha * x + beta * y
*
* Follow numpy style broadcasting between x and y
@@ -91,21 +91,24 @@ void axpby_impl(
T alpha = static_cast<T>(alpha_);
T beta = static_cast<T>(beta_);
// Do the elementwise operation for each output
// Do the element-wise operation for each output
for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
// Map linear indices to offsets in x and y
auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());
// We allocate the output to be contiguous and regularly strided
// (defaults to row major) and hence it doesn't need additonal mapping
// (defaults to row major) and hence it doesn't need additional mapping
out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
}
}
/** Fall back implementation for evaluation on CPU */
void Axpby::eval(const std::vector<array>& inputs, array& out) {
// Check the inputs (registered in the op while contructing the out array)
void Axpby::eval(
const std::vector<array>& inputs,
std::vector<array>& out_arr) {
auto out = out_arr[0];
// Check the inputs (registered in the op while constructing the out array)
assert(inputs.size() == 2);
auto& x = inputs[0];
auto& y = inputs[1];
@@ -175,7 +178,10 @@ void axpby_impl_accelerate(
}
/** Evaluate primitive on CPU using accelerate specializations */
void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
void Axpby::eval_cpu(
const std::vector<array>& inputs,
std::vector<array>& outarr) {
auto out = outarr[0];
assert(inputs.size() == 2);
auto& x = inputs[0];
auto& y = inputs[1];
@@ -189,13 +195,15 @@ void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
}
// Fall back to common backend if specializations are not available
eval(inputs, out);
eval(inputs, outarr);
}
#else // Accelerate not avaliable
#else // Accelerate not available
/** Evaluate primitive on CPU falling back to common backend */
void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
void Axpby::eval_cpu(
const std::vector<array>& inputs,
std::vector<array>& out) {
eval(inputs, out);
}
@@ -208,8 +216,11 @@ void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
#ifdef _METAL_
/** Evaluate primitive on GPU */
void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
void Axpby::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outarr) {
// Prepare inputs
auto out = outarr[0];
assert(inputs.size() == 2);
auto& x = inputs[0];
auto& y = inputs[1];
@@ -254,7 +265,7 @@ void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
compute_encoder->setComputePipelineState(kernel);
// Kernel parameters are registered with buffer indices corresponding to
// those in the kernel decelaration at axpby.metal
// those in the kernel declaration at axpby.metal
int ndim = out.ndim();
size_t nelem = out.size();
@@ -287,7 +298,7 @@ void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
// Fix the 3D size of the launch grid (in terms of threads)
MTL::Size grid_dims = MTL::Size(nelem, 1, 1);
// Launch the grid with the given number of threads divded among
// Launch the grid with the given number of threads divided among
// the given threadgroups
compute_encoder->dispatchThreads(grid_dims, group_dims);
}
@@ -295,7 +306,9 @@ void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
#else // Metal is not available
/** Fail evaluation on GPU */
void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
void Axpby::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& out) {
throw std::runtime_error("Axpby has no GPU implementation.");
}
@@ -306,13 +319,13 @@ void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
///////////////////////////////////////////////////////////////////////////////
/** The Jacobian-vector product. */
array Axpby::jvp(
std::vector<array> Axpby::jvp(
const std::vector<array>& primals,
const std::vector<array>& tangents,
const std::vector<int>& argnums) {
// Forward mode diff that pushes along the tangents
// The jvp transform on the the primitive can built with ops
// that are scheduled on the same stream as the primtive
// The jvp transform on the primitive can built with ops
// that are scheduled on the same stream as the primitive
// If argnums = {0}, we only push along x in which case the
// jvp is just the tangent scaled by alpha
@@ -321,32 +334,33 @@ array Axpby::jvp(
if (argnums.size() > 1) {
auto scale = argnums[0] == 0 ? alpha_ : beta_;
auto scale_arr = array(scale, tangents[0].dtype());
return multiply(scale_arr, tangents[0], stream());
return {multiply(scale_arr, tangents[0], stream())};
}
// If, argnums = {0, 1}, we take contributions from both
// which gives us jvp = tangent_x * alpha + tangent_y * beta
else {
return axpby(tangents[0], tangents[1], alpha_, beta_, stream());
return {axpby(tangents[0], tangents[1], alpha_, beta_, stream())};
}
}
/** The vector-Jacobian product. */
std::vector<array> Axpby::vjp(
const std::vector<array>& primals,
const array& cotan,
const std::vector<int>& argnums) {
const std::vector<array>& cotangents,
const std::vector<int>& argnums,
const std::vector<array>&) {
// Reverse mode diff
std::vector<array> vjps;
for (auto arg : argnums) {
auto scale = arg == 0 ? alpha_ : beta_;
auto scale_arr = array(scale, cotan.dtype());
vjps.push_back(multiply(scale_arr, cotan, stream()));
auto scale_arr = array(scale, cotangents[0].dtype());
vjps.push_back(multiply(scale_arr, cotangents[0], stream()));
}
return vjps;
}
/** Vectorize primitve along given axis */
std::pair<array, int> Axpby::vmap(
/** Vectorize primitive along given axis */
std::pair<std::vector<array>, std::vector<int>> Axpby::vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) {
throw std::runtime_error("Axpby has no vmap implementation.");

View File

@@ -12,7 +12,7 @@ namespace mlx::core {
///////////////////////////////////////////////////////////////////////////////
/**
* Scale and sum two vectors elementwise
* Scale and sum two vectors element-wise
* z = alpha * x + beta * y
*
* Follow numpy style broadcasting between x and y
@@ -39,14 +39,16 @@ class Axpby : public Primitive {
* A primitive must know how to evaluate itself on the CPU/GPU
* for the given inputs and populate the output array.
*
* To avoid unecessary allocations, the evaluation function
* To avoid unnecessary allocations, the evaluation function
* is responsible for allocating space for the array.
*/
void eval_cpu(const std::vector<array>& inputs, array& out) override;
void eval_gpu(const std::vector<array>& inputs, array& out) override;
void eval_cpu(const std::vector<array>& inputs, std::vector<array>& out)
override;
void eval_gpu(const std::vector<array>& inputs, std::vector<array>& out)
override;
/** The Jacobian-vector product. */
array jvp(
std::vector<array> jvp(
const std::vector<array>& primals,
const std::vector<array>& tangents,
const std::vector<int>& argnums) override;
@@ -54,8 +56,9 @@ class Axpby : public Primitive {
/** The vector-Jacobian product. */
std::vector<array> vjp(
const std::vector<array>& primals,
const array& cotan,
const std::vector<int>& argnums) override;
const std::vector<array>& cotangents,
const std::vector<int>& argnums,
const std::vector<array>& outputs) override;
/**
* The primitive must know how to vectorize itself across
@@ -63,7 +66,7 @@ class Axpby : public Primitive {
* representing the vectorized computation and the axis which
* corresponds to the output vectorized dimension.
*/
std::pair<array, int> vmap(
std::pair<std::vector<array>, std::vector<int>> vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) override;
@@ -80,7 +83,7 @@ class Axpby : public Primitive {
float beta_;
/** Fall back implementation for evaluation on CPU */
void eval(const std::vector<array>& inputs, array& out);
void eval(const std::vector<array>& inputs, std::vector<array>& out);
};
} // namespace mlx::core

View File

@@ -59,5 +59,5 @@ template <typename T>
instantiate_axpby(float32, float);
instantiate_axpby(float16, half);
instantiate_axpby(bflot16, bfloat16_t);
instantiate_axpby(bfloat16, bfloat16_t);
instantiate_axpby(complex64, complex64_t);

View File

@@ -23,7 +23,7 @@ PYBIND11_MODULE(mlx_sample_extensions, m) {
py::kw_only(),
"stream"_a = py::none(),
R"pbdoc(
Scale and sum two vectors elementwise
Scale and sum two vectors element-wise
``z = alpha * x + beta * y``
Follows numpy style broadcasting between ``x`` and ``y``

View File

@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=42", "pybind11>=2.10", "cmake>=3.24", "mlx @ git+https://github.com/mlx-explore/mlx@main"]
build-backend = "setuptools.build_meta"

View File

@@ -41,6 +41,6 @@ error_norm = mx.sum(mx.square(w - w_star)).item() ** 0.5
throughput = num_iters / (toc - tic)
print(
f"Loss {loss.item():.5f}, |w-w*| = {error_norm:.5f}, "
f"Loss {loss.item():.5f}, L2 distance: |w-w*| = {error_norm:.5f}, "
f"Throughput {throughput:.5f} (it/s)"
)

View File

@@ -5,21 +5,22 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
${CMAKE_CURRENT_SOURCE_DIR}/graph_utils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
${CMAKE_CURRENT_SOURCE_DIR}/random.cpp
${CMAKE_CURRENT_SOURCE_DIR}/scheduler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/transforms.cpp
${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/linalg.cpp
${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h
)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
if (MLX_BUILD_ACCELERATE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
if (MLX_BUILD_ACCELERATE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
else()
target_sources(

View File

@@ -9,7 +9,7 @@
namespace mlx::core::allocator {
Buffer malloc(size_t size) {
auto buffer = allocator().malloc(size);
auto buffer = allocator().malloc(size, /* allow_swap */ true);
if (size && !buffer.ptr()) {
std::ostringstream msg;
msg << "[malloc] Unable to allocate " << size << " bytes.";
@@ -22,7 +22,7 @@ void free(Buffer buffer) {
return allocator().free(buffer);
}
Buffer CommonAllocator::malloc(size_t size) {
Buffer CommonAllocator::malloc(size_t size, bool) {
return Buffer{std::malloc(size)};
}
@@ -38,6 +38,11 @@ Buffer malloc_or_wait(size_t size) {
buffer = allocator().malloc(size);
}
// Try swapping if needed
if (size && !buffer.ptr()) {
buffer = allocator().malloc(size, /* allow_swap = */ true);
}
if (size && !buffer.ptr()) {
std::ostringstream msg;
msg << "[malloc_or_wait] Unable to allocate " << size << " bytes.";

View File

@@ -37,9 +37,9 @@ void free(Buffer buffer);
Buffer malloc_or_wait(size_t size);
class Allocator {
/** Abstract base clase for a memory allocator. */
/** Abstract base class for a memory allocator. */
public:
virtual Buffer malloc(size_t size) = 0;
virtual Buffer malloc(size_t size, bool allow_swap = false) = 0;
virtual void free(Buffer buffer) = 0;
Allocator() = default;
@@ -55,7 +55,7 @@ Allocator& allocator();
class CommonAllocator : public Allocator {
/** A general CPU allocator. */
public:
virtual Buffer malloc(size_t size) override;
virtual Buffer malloc(size_t size, bool allow_swap = false) override;
virtual void free(Buffer buffer) override;
private:

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <functional>
@@ -6,6 +6,7 @@
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"
namespace mlx::core {
@@ -21,6 +22,12 @@ std::pair<size_t, std::vector<size_t>> cum_prod(const std::vector<int>& shape) {
return {cum_prod, strides};
}
/** Return true if we are currently performing a function transformation in
* order to keep the graph when evaluating tracer arrays. */
bool in_tracing() {
return detail::InTracing::in_tracing();
}
} // namespace
array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
@@ -32,7 +39,7 @@ array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
array::array(
const std::vector<int>& shape,
Dtype dtype,
std::unique_ptr<Primitive> primitive,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs)
: array_desc_(std::make_shared<ArrayDesc>(
shape,
@@ -40,6 +47,34 @@ array::array(
std::move(primitive),
inputs)) {}
array::array(
std::vector<int> shape,
Dtype dtype,
std::shared_ptr<Primitive> primitive,
std::vector<array>&& inputs)
: array_desc_(std::make_shared<ArrayDesc>(
std::move(shape),
dtype,
std::move(primitive),
std::move(inputs))) {}
std::vector<array> array::make_arrays(
const std::vector<std::vector<int>>& shapes,
const std::vector<Dtype>& dtypes,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs) {
std::vector<array> outputs;
for (int i = 0; i < shapes.size(); ++i) {
outputs.push_back(array(shapes[i], dtypes[i], primitive, inputs));
}
for (int i = 0; i < outputs.size(); ++i) {
auto siblings = outputs;
siblings.erase(siblings.begin() + i);
outputs[i].set_siblings(std::move(siblings), i);
}
return outputs;
}
array::array(std::initializer_list<float> data)
: array_desc_(std::make_shared<ArrayDesc>(
std::vector<int>{static_cast<int>(data.size())},
@@ -58,12 +93,26 @@ array::array(
}
void array::detach() {
for (auto& s : array_desc_->siblings) {
s.array_desc_->inputs.clear();
s.array_desc_->siblings.clear();
s.array_desc_->position = 0;
s.array_desc_->depth = 0;
s.array_desc_->primitive = nullptr;
}
array_desc_->inputs.clear();
array_desc_->siblings.clear();
array_desc_->position = 0;
array_desc_->depth = 0;
array_desc_->primitive = nullptr;
}
void array::eval(bool retain_graph /* = false */) {
mlx::core::eval({*this}, retain_graph);
void array::eval() {
mlx::core::eval({*this});
}
bool array::is_tracer() const {
return array_desc_->is_tracer && in_tracing();
}
void array::set_data(allocator::Buffer buffer, deleter_t d) {
@@ -108,6 +157,14 @@ void array::copy_shared_buffer(const array& other) {
copy_shared_buffer(other, other.strides(), other.flags(), other.data_size());
}
void array::move_shared_buffer(array other) {
array_desc_->data = std::move(other.array_desc_->data);
array_desc_->strides = other.strides();
array_desc_->flags = other.flags();
array_desc_->data_size = other.data_size();
array_desc_->data_ptr = other.array_desc_->data_ptr;
}
array::ArrayDesc::ArrayDesc(const std::vector<int>& shape, Dtype dtype)
: shape(shape), dtype(dtype) {
std::tie(size, strides) = cum_prod(shape);
@@ -116,21 +173,43 @@ array::ArrayDesc::ArrayDesc(const std::vector<int>& shape, Dtype dtype)
array::ArrayDesc::ArrayDesc(
const std::vector<int>& shape,
Dtype dtype,
std::unique_ptr<Primitive> primitive,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs)
: shape(shape),
dtype(dtype),
primitive(std::move(primitive)),
inputs(inputs) {
std::tie(size, strides) = cum_prod(shape);
for (auto& in : inputs) {
std::tie(size, strides) = cum_prod(this->shape);
for (auto& in : this->inputs) {
is_tracer |= in.is_tracer();
depth = std::max(in.graph_depth(), depth);
}
depth++;
}
// Needed because the Primitive type used in array.h is incomplete and the
// compiler needs to see the call to the destructor after the type is complete.
array::ArrayDesc::~ArrayDesc() = default;
array::ArrayDesc::ArrayDesc(
std::vector<int>&& shape,
Dtype dtype,
std::shared_ptr<Primitive> primitive,
std::vector<array>&& inputs)
: shape(std::move(shape)),
dtype(dtype),
primitive(std::move(primitive)),
inputs(std::move(inputs)) {
std::tie(size, strides) = cum_prod(this->shape);
for (auto& in : this->inputs) {
is_tracer |= in.is_tracer();
depth = std::max(in.graph_depth(), depth);
}
depth++;
}
array::ArrayIterator::ArrayIterator(const array& arr, int idx)
: arr(arr), idx(idx) {
if (arr.ndim() == 0) {
throw std::invalid_argument("Cannot iterate over 0-d array.");
}
}
array::ArrayIterator::reference array::ArrayIterator::operator*() const {
auto start = std::vector<int>(arr.ndim(), 0);

View File

@@ -1,5 +1,4 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include <algorithm>
#include <cstdint>
@@ -116,11 +115,14 @@ class array {
};
/** Evaluate the array. */
void eval(bool retain_graph = false);
void eval();
/** Get the value from a scalar array. */
template <typename T>
T item(bool retain_graph = false);
T item();
template <typename T>
T item() const;
struct ArrayIterator {
using iterator_category = std::random_access_iterator_tag;
@@ -128,11 +130,7 @@ class array {
using value_type = const array;
using reference = value_type;
explicit ArrayIterator(const array& arr, int idx = 0) : arr(arr), idx(idx) {
if (arr.ndim() == 0) {
throw std::invalid_argument("Cannot iterate over 0-d array.");
}
}
explicit ArrayIterator(const array& arr, int idx = 0);
reference operator*() const;
@@ -174,7 +172,19 @@ class array {
array(
const std::vector<int>& shape,
Dtype dtype,
std::unique_ptr<Primitive> primitive,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs);
array(
std::vector<int> shape,
Dtype dtype,
std::shared_ptr<Primitive> primitive,
std::vector<array>&& inputs);
static std::vector<array> make_arrays(
const std::vector<std::vector<int>>& shapes,
const std::vector<Dtype>& dtypes,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs);
/** A unique identifier for an array. */
@@ -182,6 +192,11 @@ class array {
return reinterpret_cast<std::uintptr_t>(array_desc_.get());
}
/** A unique identifier for an arrays primitive. */
std::uintptr_t primitive_id() const {
return reinterpret_cast<std::uintptr_t>(array_desc_->primitive.get());
}
struct Data {
allocator::Buffer buffer;
deleter_t d;
@@ -209,6 +224,11 @@ class array {
return *(array_desc_->primitive);
};
/** A shared pointer to the array's primitive. */
std::shared_ptr<Primitive>& primitive_ptr() const {
return array_desc_->primitive;
};
/** Check if the array has an attached primitive or is a leaf node. */
bool has_primitive() const {
return array_desc_->primitive != nullptr;
@@ -219,12 +239,42 @@ class array {
return array_desc_->inputs;
};
/** A non-const reference to the array's inputs so that they can be used to
* edit the graph. */
std::vector<array>& editable_inputs() {
std::vector<array>& inputs() {
return array_desc_->inputs;
}
/** True indicates the arrays buffer is safe to reuse */
bool is_donatable() const {
return array_desc_.use_count() == 1 && (array_desc_->data.use_count() == 1);
}
/** The array's siblings. */
const std::vector<array>& siblings() const {
return array_desc_->siblings;
};
void set_siblings(std::vector<array> siblings, uint16_t position) {
array_desc_->siblings = std::move(siblings);
array_desc_->position = position;
}
/** The outputs of the array's primitive (i.e. this array and
* its siblings) in the order the primitive expects. */
std::vector<array> outputs() const {
auto idx = array_desc_->position;
std::vector<array> outputs;
outputs.reserve(siblings().size() + 1);
outputs.insert(outputs.end(), siblings().begin(), siblings().begin() + idx);
outputs.push_back(*this);
outputs.insert(outputs.end(), siblings().begin() + idx, siblings().end());
return outputs;
};
/** The depth of the array in the graph. Evaluated arrays have depth 0. */
uint16_t graph_depth() const {
return array_desc_->depth;
}
/** Detach the array from the graph. */
void detach();
@@ -245,6 +295,12 @@ class array {
return array_desc_->data->buffer;
};
// Return a copy of the shared pointer
// to the array::Data struct
std::shared_ptr<Data> data_shared_ptr() const {
return array_desc_->data;
}
// Return a raw pointer to the arrays data
template <typename T>
T* data() {
return static_cast<T*>(array_desc_->data_ptr);
@@ -265,9 +321,7 @@ class array {
array_desc_->is_tracer = is_tracer;
}
// Check if the array is a tracer array
bool is_tracer() const {
return array_desc_->is_tracer;
}
bool is_tracer() const;
void set_data(allocator::Buffer buffer, deleter_t d = allocator::free);
@@ -287,6 +341,8 @@ class array {
void copy_shared_buffer(const array& other);
void move_shared_buffer(array other);
void overwrite_descriptor(const array& other) {
array_desc_ = other.array_desc_;
}
@@ -301,7 +357,7 @@ class array {
std::vector<size_t> strides;
size_t size;
Dtype dtype;
std::unique_ptr<Primitive> primitive{nullptr};
std::shared_ptr<Primitive> primitive{nullptr};
// Indicates an array is being used in a graph transform
// and should not be detached from the graph
@@ -323,22 +379,34 @@ class array {
Flags flags;
std::vector<array> inputs;
// An array to keep track of the siblings from a multi-output
// primitive.
std::vector<array> siblings;
// The arrays position in the output list
uint32_t position{0};
// The depth of the array in the graph.
uint16_t depth{0};
explicit ArrayDesc(const std::vector<int>& shape, Dtype dtype);
explicit ArrayDesc(
const std::vector<int>& shape,
Dtype dtype,
std::unique_ptr<Primitive> primitive,
std::shared_ptr<Primitive> primitive,
const std::vector<array>& inputs);
~ArrayDesc();
explicit ArrayDesc(
std::vector<int>&& shape,
Dtype dtype,
std::shared_ptr<Primitive> primitive,
std::vector<array>&& inputs);
};
// The ArrayDesc contains the details of the materialized array including the
// shape, strides, the data type. It also includes
// the primitive which knows how to compute the array's data from its inputs
// and a the list of array's inputs for the primitive.
// and the list of array's inputs for the primitive.
std::shared_ptr<ArrayDesc> array_desc_{nullptr};
};
@@ -381,11 +449,23 @@ array::array(
}
template <typename T>
T array::item(bool retain_graph /* = false */) {
T array::item() {
if (size() != 1) {
throw std::invalid_argument("item can only be called on arrays of size 1.");
}
eval(retain_graph);
eval();
return *data<T>();
}
template <typename T>
T array::item() const {
if (size() != 1) {
throw std::invalid_argument("item can only be called on arrays of size 1.");
}
if (!is_evaled()) {
throw std::invalid_argument(
"item() const can only be called on evaled arrays");
}
return *data<T>();
}

View File

@@ -29,12 +29,16 @@ std::tuple<bool, size_t, array> check_transpose(const array& arr) {
}
}
inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
inline void matmul_cblas_general(
const array& a_pre,
const array& b_pre,
array& out,
float alpha = 1.0f,
float beta = 0.0f) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[matmul_cblas] on CPU currently only supports float32");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
@@ -42,6 +46,14 @@ inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (M == 0 || N == 0) {
return;
}
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
for (int i = 0; i < (a.size() / (M * K)); ++i) {
cblas_sgemm(
CblasRowMajor,
@@ -50,21 +62,34 @@ inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
M,
N,
K,
1.0f, // alpha
alpha, // alpha
a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
lda,
b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
ldb,
0.0f, // beta
beta, // beta
out.data<float>() + M * N * i,
out.shape(-1) // ldc
);
}
}
inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
// TODO: Update to utilize BNNS broadcasting
inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[matmul_cblas] on CPU currently only supports float32");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
return matmul_cblas_general(a_pre, b_pre, out);
}
inline void matmul_bnns_general(
const array& a_pre,
const array& b_pre,
array& out,
float alpha = 1.0f,
float beta = 0.0f) {
// TODO: Update to utilize BNNS broadcasting
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
@@ -72,11 +97,19 @@ inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (M == 0 || N == 0) {
return;
}
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
BNNSDataType bnns_dtype = to_bnns_dtype(out.dtype());
const BNNSLayerParametersBroadcastMatMul gemm_params{
/* float alpha = */ 1.0,
/* float beta = */ 0.0,
/* float alpha = */ alpha,
/* float beta = */ beta,
/* bool transA = */ a_transposed,
/* bool transB = */ b_transposed,
/* bool quadratic = */ false,
@@ -157,6 +190,12 @@ inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
BNNSFilterDestroy(bnns_filter);
}
inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
// TODO: Update to utilize BNNS broadcasting
out.set_data(allocator::malloc_or_wait(out.nbytes()));
return matmul_bnns_general(a_pre, b_pre, out);
}
} // namespace
void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
@@ -166,4 +205,16 @@ void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
return matmul_bnns(inputs[0], inputs[1], out);
}
} // namespace mlx::core
void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
// Fill output with C
auto& c = inputs[2];
CopyType ctype = c.data_size() == 1 ? CopyType::Scalar : CopyType::General;
copy(c, out, ctype);
if (out.dtype() == float32) {
return matmul_cblas_general(inputs[0], inputs[1], out, alpha_, beta_);
}
return matmul_bnns_general(inputs[0], inputs[1], out, alpha_, beta_);
}
} // namespace mlx::core

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <cassert>
#include <cmath>
@@ -17,6 +17,12 @@
primitive::eval(inputs, out); \
}
#define DEFAULT_MULTI(primitive) \
void primitive::eval_cpu( \
const std::vector<array>& inputs, std::vector<array>& outputs) { \
primitive::eval(inputs, outputs); \
}
namespace mlx::core {
// Use the default implementation for the following primitives
@@ -27,8 +33,12 @@ DEFAULT(ArgSort)
DEFAULT(AsStrided)
DEFAULT(Broadcast)
DEFAULT(Ceil)
DEFAULT_MULTI(Compiled)
DEFAULT(Concatenate)
DEFAULT(Copy)
DEFAULT_MULTI(CustomVJP)
DEFAULT_MULTI(Depends)
DEFAULT_MULTI(DivMod)
DEFAULT(Equal)
DEFAULT(Erf)
DEFAULT(ErfInv)
@@ -41,10 +51,15 @@ DEFAULT(Less)
DEFAULT(LessEqual)
DEFAULT(Load)
DEFAULT(LogicalNot)
DEFAULT(LogicalAnd)
DEFAULT(LogicalOr)
DEFAULT(LogAddExp)
DEFAULT(Maximum)
DEFAULT(Minimum)
DEFAULT(NotEqual)
DEFAULT(Pad)
DEFAULT(Partition)
DEFAULT_MULTI(QRF)
DEFAULT(RandomBits)
DEFAULT(Reshape)
DEFAULT(Round)
@@ -52,6 +67,7 @@ DEFAULT(Scatter)
DEFAULT(Sigmoid)
DEFAULT(Sign)
DEFAULT(Slice)
DEFAULT_MULTI(Split)
DEFAULT(Sort)
DEFAULT(StopGradient)
DEFAULT(Transpose)
@@ -60,21 +76,11 @@ void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vDSP_vabs(in.data<float>(), 1, out.data<float>(), 1, size);
set_unary_output_data(in, out);
vDSP_vabs(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
} else if (in.dtype() == int32 && in.flags().contiguous) {
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vDSP_vabsi(in.data<int>(), 1, out.data<int>(), 1, size);
set_unary_output_data(in, out);
vDSP_vabsi(in.data<int>(), 1, out.data<int>(), 1, in.data_size());
} else if (is_unsigned(in.dtype())) {
// No-op for unsigned types
out.copy_shared_buffer(in);
@@ -127,12 +133,8 @@ void ArcCos::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvacosf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -143,12 +145,8 @@ void ArcCosh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvacoshf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -159,12 +157,8 @@ void ArcSin::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvasinf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -175,12 +169,8 @@ void ArcSinh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvasinhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -191,12 +181,8 @@ void ArcTan::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvatanf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -207,12 +193,8 @@ void ArcTanh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvatanhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -224,30 +206,23 @@ void AsType::eval_cpu(const std::vector<array>& inputs, array& out) {
auto& in = inputs[0];
if (in.flags().contiguous) {
auto allocfn = [&in, &out]() {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
};
// Use accelerate functions if possible
if (in.dtype() == float32 && out.dtype() == uint32) {
allocfn();
set_unary_output_data(in, out);
vDSP_vfixu32(
in.data<float>(), 1, out.data<uint32_t>(), 1, in.data_size());
return;
} else if (in.dtype() == float32 && out.dtype() == int32) {
allocfn();
set_unary_output_data(in, out);
vDSP_vfix32(in.data<float>(), 1, out.data<int32_t>(), 1, in.data_size());
return;
} else if (in.dtype() == uint32 && out.dtype() == float32) {
allocfn();
set_unary_output_data(in, out);
vDSP_vfltu32(
in.data<uint32_t>(), 1, out.data<float>(), 1, in.data_size());
return;
} else if (in.dtype() == int32 && out.dtype() == float32) {
allocfn();
set_unary_output_data(in, out);
vDSP_vflt32(in.data<int32_t>(), 1, out.data<float>(), 1, in.data_size());
return;
}
@@ -259,12 +234,8 @@ void Cos::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvcosf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -275,12 +246,8 @@ void Cosh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvcoshf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -368,12 +335,8 @@ void Exp::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvexpf(out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
} else if (is_floating_point(out.dtype())) {
unary_fp(in, out, [](auto x) { return std::exp(x); });
@@ -400,12 +363,8 @@ void Log::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
switch (base_) {
case Base::e:
vvlogf(
@@ -429,12 +388,8 @@ void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvlog1pf(
out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
} else if (is_floating_point(out.dtype())) {
@@ -446,47 +401,6 @@ void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void Maximum::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (out.dtype() == float32) {
binary(
a,
b,
out,
[](auto x, auto y) { return (x > y) ? x : y; },
UseDefaultBinaryOp(),
UseDefaultBinaryOp(),
[](const auto* a, const auto* b, auto* out, int n) {
vDSP_vmax((const float*)a, 1, (const float*)b, 1, (float*)out, 1, n);
});
} else {
binary(a, b, out, [](auto x, auto y) { return (x > y) ? x : y; });
}
}
void Minimum::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (out.dtype() == float32) {
binary(
a,
b,
out,
[](auto x, auto y) { return (x < y) ? x : y; },
UseDefaultBinaryOp(),
UseDefaultBinaryOp(),
[](const auto* a, const auto* b, auto* out, int n) {
vDSP_vmin((const float*)a, 1, (const float*)b, 1, (float*)out, 1, n);
});
} else {
binary(a, b, out, [](auto x, auto y) { return (x < y) ? x : y; });
}
}
void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
@@ -516,13 +430,8 @@ void Negative::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vDSP_vneg(in.data<float>(), 1, out.data<float>(), 1, size);
set_unary_output_data(in, out);
vDSP_vneg(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
} else {
unary(in, out, [](auto x) { return -x; });
}
@@ -535,7 +444,13 @@ void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() == float32 && a.flags().row_contiguous &&
b.flags().row_contiguous) {
int size = a.size();
out.set_data(allocator::malloc_or_wait(out.nbytes()));
if (a.is_donatable() && a.itemsize() == out.itemsize()) {
out.copy_shared_buffer(a);
} else if (b.is_donatable() && b.itemsize() == out.itemsize()) {
out.copy_shared_buffer(b);
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
vvpowf(out.data<float>(), b.data<float>(), a.data<float>(), &size);
} else {
eval(inputs, out);
@@ -577,12 +492,8 @@ void Sin::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvsinf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -593,12 +504,8 @@ void Sinh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvsinhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -609,12 +516,8 @@ void Square::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vDSP_vsq(in.data<float>(), 1, out.data<float>(), 1, size);
} else {
unary(in, out, [](auto x) { return x * x; });
@@ -625,12 +528,8 @@ void Sqrt::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
if (recip_) {
vvrsqrtf(out.data<float>(), in.data<float>(), &size);
} else {
@@ -685,12 +584,8 @@ void Tan::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvtanf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
@@ -701,12 +596,8 @@ void Tanh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
vvtanhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);

View File

@@ -76,20 +76,16 @@ void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
auto& scales = inputs[2];
auto& biases = inputs[3];
if (w.strides()[0] != 1) {
throw std::runtime_error("The quantized weight should be transposed");
}
bool condition =
(transpose_ && x.flags().row_contiguous && w.flags().row_contiguous &&
scales.flags().row_contiguous && biases.flags().row_contiguous &&
x.dtype() == float32 && bits_ == 4 && group_size_ == 64);
if (!x.flags().row_contiguous || !scales.flags().row_contiguous ||
!biases.flags().row_contiguous) {
throw std::runtime_error("x, scales and biases should be row contiguous.");
}
if (x.dtype() == float32 && bits_ == 4 && group_size_ == 64) {
if (condition) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
int K = x.shape(-1);
int M = x.size() / K;
int N = w.shape(1);
int N = out.shape(-1);
_qmm_t_4_64(
out.data<float>(),
x.data<float>(),

View File

@@ -3,6 +3,7 @@ target_sources(
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
${CMAKE_CURRENT_SOURCE_DIR}/erf.cpp
@@ -16,4 +17,5 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
)

View File

@@ -6,6 +6,7 @@
#include "mlx/allocator.h"
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/binary_two.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
@@ -75,6 +76,61 @@ void Add::eval(const std::vector<array>& inputs, array& out) {
binary(a, b, out, [](auto x, auto y) { return x + y; });
}
void DivMod::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
auto integral_op = [](auto x, auto y) {
return std::make_pair(x / y, x % y);
};
auto float_op = [](auto x, auto y) {
return std::make_pair(std::trunc(x / y), std::fmod(x, y));
};
switch (outputs[0].dtype()) {
case bool_:
binary_op<bool>(a, b, outputs, integral_op);
break;
case uint8:
binary_op<uint8_t>(a, b, outputs, integral_op);
break;
case uint16:
binary_op<uint16_t>(a, b, outputs, integral_op);
break;
case uint32:
binary_op<uint32_t>(a, b, outputs, integral_op);
break;
case uint64:
binary_op<uint64_t>(a, b, outputs, integral_op);
break;
case int8:
binary_op<int8_t>(a, b, outputs, integral_op);
break;
case int16:
binary_op<int16_t>(a, b, outputs, integral_op);
break;
case int32:
binary_op<int32_t>(a, b, outputs, integral_op);
break;
case int64:
binary_op<int64_t>(a, b, outputs, integral_op);
break;
case float16:
binary_op<float16_t>(a, b, outputs, float_op);
break;
case float32:
binary_op<float>(a, b, outputs, float_op);
break;
case bfloat16:
binary_op<bfloat16_t>(a, b, outputs, float_op);
break;
case complex64:
// Should never get here
throw std::runtime_error("[DivMod] Complex type not supported");
break;
}
}
void Divide::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
@@ -177,14 +233,33 @@ void Maximum::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, [](auto x, auto y) { return (x > y) ? x : y; });
if (is_floating_point(out.dtype())) {
binary(a, b, out, [](auto x, auto y) {
if (std::isnan(x)) {
return x;
}
return (x > y) ? x : y;
});
} else {
binary(a, b, out, [](auto x, auto y) { return (x > y) ? x : y; });
}
}
void Minimum::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, [](auto x, auto y) { return (x < y) ? x : y; });
if (is_floating_point(out.dtype())) {
binary(a, b, out, [](auto x, auto y) {
if (std::isnan(x)) {
return x;
}
return (x < y) ? x : y;
});
} else {
binary(a, b, out, [](auto x, auto y) { return (x < y) ? x : y; });
}
}
void Multiply::eval(const std::vector<array>& inputs, array& out) {

View File

@@ -1,7 +1,6 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/utils.h"
@@ -40,29 +39,83 @@ void set_binary_op_output_data(
const array& a,
const array& b,
array& out,
BinaryOpType bopt) {
BinaryOpType bopt,
bool donate_with_move = false) {
switch (bopt) {
case ScalarScalar:
out.set_data(
allocator::malloc_or_wait(out.itemsize()), 1, a.strides(), a.flags());
break;
case ScalarVector:
out.set_data(
allocator::malloc_or_wait(b.data_size() * out.itemsize()),
b.data_size(),
b.strides(),
b.flags());
if (b.is_donatable() && b.itemsize() == out.itemsize()) {
if (donate_with_move) {
out.move_shared_buffer(b);
} else {
out.copy_shared_buffer(b);
}
} else {
out.set_data(
allocator::malloc_or_wait(b.data_size() * out.itemsize()),
b.data_size(),
b.strides(),
b.flags());
}
break;
case VectorScalar:
if (a.is_donatable() && a.itemsize() == out.itemsize()) {
if (donate_with_move) {
out.move_shared_buffer(a);
} else {
out.copy_shared_buffer(a);
}
} else {
out.set_data(
allocator::malloc_or_wait(a.data_size() * out.itemsize()),
a.data_size(),
a.strides(),
a.flags());
}
break;
case VectorVector:
out.set_data(
allocator::malloc_or_wait(a.data_size() * out.itemsize()),
a.data_size(),
a.strides(),
a.flags());
if (a.is_donatable() && a.itemsize() == out.itemsize()) {
if (donate_with_move) {
out.move_shared_buffer(a);
} else {
out.copy_shared_buffer(a);
}
} else if (b.is_donatable() && b.itemsize() == out.itemsize()) {
if (donate_with_move) {
out.move_shared_buffer(b);
} else {
out.copy_shared_buffer(b);
}
} else {
out.set_data(
allocator::malloc_or_wait(a.data_size() * out.itemsize()),
a.data_size(),
a.strides(),
a.flags());
}
break;
case General:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
if (a.is_donatable() && a.flags().row_contiguous &&
a.itemsize() == out.itemsize() && a.size() == out.size()) {
if (donate_with_move) {
out.move_shared_buffer(a);
} else {
out.copy_shared_buffer(a);
}
} else if (
b.is_donatable() && b.flags().row_contiguous &&
b.itemsize() == out.itemsize() && b.size() == out.size()) {
if (donate_with_move) {
out.move_shared_buffer(b);
} else {
out.copy_shared_buffer(b);
}
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
break;
}
}
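A minimal usage sketch of the donation path added above; the wrapper function below is hypothetical, but it only calls helpers declared in this header (get_binary_op_type, set_binary_op_output_data):
#include "mlx/backend/common/binary.h"

using namespace mlx::core;

// Hypothetical caller: when an input buffer will not be read again, it can be
// donated to the output instead of allocating fresh memory.
void binary_output_sketch(const array& a, const array& b, array& out) {
  auto bopt = get_binary_op_type(a, b);
  // donate_with_move = true moves the shared buffer handle into `out`;
  // the default (false) copies the handle and keeps the input alive.
  set_binary_op_output_data(a, b, out, bopt, /* donate_with_move = */ true);
}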
@@ -73,6 +126,12 @@ struct UseDefaultBinaryOp {
// Should we throw? This should normally never be called.
assert(false);
}
template <typename T, typename U>
void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
// Should we throw? This should normally never be called.
assert(false);
}
};
template <typename T, typename U, typename Op>
@@ -89,6 +148,18 @@ struct DefaultVectorScalar {
a++;
}
}
void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
T scalar = *b;
while (size-- > 0) {
auto dst = op(*a, scalar);
*dst_a = dst.first;
*dst_b = dst.second;
dst_a++;
dst_b++;
a++;
}
}
};
template <typename T, typename U, typename Op>
@@ -105,6 +176,18 @@ struct DefaultScalarVector {
b++;
}
}
void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
T scalar = *a;
while (size-- > 0) {
auto dst = op(scalar, *b);
*dst_a = dst.first;
*dst_b = dst.second;
dst_a++;
dst_b++;
b++;
}
}
};
template <typename T, typename U, typename Op>
@@ -121,6 +204,18 @@ struct DefaultVectorVector {
b++;
}
}
void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
while (size-- > 0) {
auto dst = op(*a, *b);
*dst_a = dst.first;
*dst_b = dst.second;
dst_a++;
dst_b++;
a++;
b++;
}
}
};
template <typename T, typename U, typename Op>

View File

@@ -0,0 +1,536 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/utils.h"
namespace mlx::core {
namespace {
template <typename T, typename U, typename Op>
void binary_op_dims1(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
for (size_t i = 0; i < out_a.size(); ++i) {
auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
dst_a[i] = dst.first;
dst_b[i] = dst.second;
a_idx += a.strides()[0];
b_idx += b.strides()[0];
}
}
template <typename T, typename U, typename Op>
void binary_op_dims1(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op,
int stride) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
for (size_t i = 0; i < a.shape()[0]; i++) {
op(a_ptr + a_idx, b_ptr + b_idx, dst_a, dst_b, stride);
a_idx += a.strides()[0];
b_idx += b.strides()[0];
dst_a += stride;
dst_b += stride;
}
}
template <typename T, typename U, typename Op>
void binary_op_dims2(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
size_t out_idx = 0;
for (size_t i = 0; i < a.shape()[0]; ++i) {
for (size_t j = 0; j < a.shape()[1]; ++j) {
auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
dst_a[out_idx] = dst.first;
dst_b[out_idx++] = dst.second;
a_idx += a.strides()[1];
b_idx += b.strides()[1];
}
a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
}
}
template <typename T, typename U, typename Op>
void binary_op_dims2(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op,
int stride) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
for (size_t i = 0; i < a.shape()[0]; ++i) {
for (size_t j = 0; j < a.shape()[1]; ++j) {
op(a_ptr + a_idx, b_ptr + b_idx, dst_a, dst_b, stride);
a_idx += a.strides()[1];
b_idx += b.strides()[1];
dst_a += stride;
dst_b += stride;
}
a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
}
}
template <typename T, typename U, typename Op>
void binary_op_dims3(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
size_t out_idx = 0;
for (size_t i = 0; i < a.shape()[0]; ++i) {
for (size_t j = 0; j < a.shape()[1]; ++j) {
for (size_t k = 0; k < a.shape()[2]; ++k) {
auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
dst_a[out_idx] = dst.first;
dst_b[out_idx++] = dst.second;
a_idx += a.strides()[2];
b_idx += b.strides()[2];
}
a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
}
a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
}
}
template <typename T, typename U, typename Op>
void binary_op_dims4(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
size_t a_idx = 0;
size_t b_idx = 0;
size_t out_idx = 0;
for (size_t i = 0; i < a.shape()[0]; ++i) {
for (size_t j = 0; j < a.shape()[1]; ++j) {
for (size_t k = 0; k < a.shape()[2]; ++k) {
for (size_t ii = 0; ii < a.shape()[3]; ++ii) {
auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
dst_a[out_idx] = dst.first;
dst_b[out_idx++] = dst.second;
a_idx += a.strides()[3];
b_idx += b.strides()[3];
}
a_idx += a.strides()[2] - a.strides()[3] * a.shape()[3];
b_idx += b.strides()[2] - b.strides()[3] * b.shape()[3];
}
a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
}
a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
}
}
template <typename T, typename U, typename Op>
void binary_op_dispatch_dims(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op) {
switch (out_a.ndim()) {
case 1:
binary_op_dims1<T, U, Op>(a, b, out_a, out_b, op);
return;
case 2:
binary_op_dims2<T, U, Op>(a, b, out_a, out_b, op);
return;
case 3:
binary_op_dims3<T, U, Op>(a, b, out_a, out_b, op);
return;
case 4:
binary_op_dims4<T, U, Op>(a, b, out_a, out_b, op);
return;
}
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
for (size_t i = 0; i < out_a.size(); i++) {
int a_idx = elem_to_loc(i, a.shape(), a.strides());
int b_idx = elem_to_loc(i, b.shape(), b.strides());
std::tie(dst_a[i], dst_b[i]) = op(a_ptr[a_idx], b_ptr[b_idx]);
}
}
template <typename T, typename U, typename Op>
void binary_op_dispatch_dims(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op,
int dim,
int stride) {
// Number of dimensions to loop over for vectorized ops
switch (dim) {
case 1:
binary_op_dims1<T, U, Op>(a, b, out_a, out_b, op, stride);
return;
case 2:
binary_op_dims2<T, U, Op>(a, b, out_a, out_b, op, stride);
return;
}
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* dst_a = out_a.data<U>();
U* dst_b = out_b.data<U>();
for (size_t i = 0; i < out_a.size(); i += stride) {
int a_idx = elem_to_loc(i, a.shape(), a.strides());
int b_idx = elem_to_loc(i, b.shape(), b.strides());
op(a_ptr + a_idx, b_ptr + b_idx, dst_a, dst_b, stride);
dst_a += stride;
dst_b += stride;
}
}
template <
typename T,
typename U,
typename Op,
typename OpSV,
typename OpVS,
typename OpVV>
void binary_op(
const array& a,
const array& b,
array& out_a,
array& out_b,
Op op,
OpSV opsv,
OpVS opvs,
OpVV opvv) {
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out_a, bopt);
set_binary_op_output_data(a, b, out_b, bopt);
// The full computation is scalar scalar so call the base op once
if (bopt == ScalarScalar) {
std::tie(*(out_a.data<U>()), *(out_b.data<U>())) =
op(*a.data<T>(), *b.data<T>());
return;
}
// The full computation is scalar vector so delegate to the op
if (bopt == ScalarVector) {
opsv(
a.data<T>(),
b.data<T>(),
out_a.data<U>(),
out_b.data<U>(),
b.data_size());
return;
}
// The full computation is vector scalar so delegate to the op
if (bopt == VectorScalar) {
opvs(
a.data<T>(),
b.data<T>(),
out_a.data<U>(),
out_b.data<U>(),
a.data_size());
return;
}
// The full computation is vector vector so delegate to the op
if (bopt == VectorVector) {
opvv(
a.data<T>(),
b.data<T>(),
out_a.data<U>(),
out_b.data<U>(),
out_a.size());
return;
}
// General computation so let's try to optimize
// Get the left-most dim such that the array is row contiguous after
auto& strides = out_a.strides();
auto leftmost_rc_dim = [&strides](const array& arr) {
int d = arr.ndim() - 1;
for (; d >= 0 && arr.strides()[d] == strides[d]; d--) {
}
return d + 1;
};
auto a_rc_dim = leftmost_rc_dim(a);
auto b_rc_dim = leftmost_rc_dim(b);
// Get the left-most dim such that the array is a broadcasted "scalar" after
auto leftmost_s_dim = [](const array& arr) {
int d = arr.ndim() - 1;
for (; d >= 0 && arr.strides()[d] == 0; d--) {
}
return d + 1;
};
auto a_s_dim = leftmost_s_dim(a);
auto b_s_dim = leftmost_s_dim(b);
auto ndim = out_a.ndim();
// Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
int dim = ndim;
if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
bopt = VectorVector;
dim = d;
// Case 2: LxM and Fx1 where L and F are broadcastable and M is row
// contiguous
} else if (int d = std::max(a_rc_dim, b_s_dim); d < ndim) {
bopt = VectorScalar;
dim = d;
// Case 3: Lx1 and FxM where L and F are broadcastable and M is row
// contiguous
} else if (int d = std::max(a_s_dim, b_rc_dim); d < ndim) {
bopt = ScalarVector;
dim = d;
}
// Can be sure dim > 0 since otherwise we would have used one of the fully
// contiguous methods above. Except for the case that the flags do not
// correspond to the underlying contiguity.
size_t stride;
if (dim == 0 || strides[dim - 1] < 16) {
stride = 1;
bopt = General;
dim = ndim;
} else {
stride = strides[dim - 1];
}
switch (bopt) {
case VectorVector:
binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, opvv, dim, stride);
break;
case VectorScalar:
binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, opvs, dim, stride);
break;
case ScalarVector:
binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, opsv, dim, stride);
break;
default:
binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, op);
break;
}
}
template <typename T, typename Op, typename OpSV, typename OpVS, typename OpVV>
void binary_op(
const array& a,
const array& b,
std::vector<array>& outputs,
Op op,
OpSV opsv,
OpVS opvs,
OpVV opvv) {
// TODO: The following mess of constexpr evaluations can probably be achieved
// with template specializations and overloading. Would it be simpler?
if (std::is_same<decltype(opsv), UseDefaultBinaryOp>::value) {
if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// All ops are UseDefaultBinaryOp (why oh why would someone call that?)
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
DefaultScalarVector<T, T, Op>(op),
DefaultVectorScalar<T, T, Op>(op),
DefaultVectorVector<T, T, Op>(op));
} else {
// opsv and opvs were UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
DefaultScalarVector<T, T, Op>(op),
DefaultVectorScalar<T, T, Op>(op),
opvv);
}
} else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// opsv and opvv were UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
DefaultScalarVector<T, T, Op>(op),
opvs,
DefaultVectorVector<T, T, Op>(op));
} else {
// opsv was UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
DefaultScalarVector<T, T, Op>(op),
opvs,
opvv);
}
} else if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// opvs and opvv were UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
opsv,
DefaultVectorScalar<T, T, Op>(op),
DefaultVectorVector<T, T, Op>(op));
} else {
// opvs was UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
opsv,
DefaultVectorScalar<T, T, Op>(op),
opvv);
}
} else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// opvv was UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
outputs[0],
outputs[1],
op,
opsv,
opvs,
DefaultVectorVector<T, T, Op>(op));
} else {
// All ops provided
binary_op<T, T>(a, b, outputs[0], outputs[1], op, opsv, opvs, opvv);
}
}
template <typename T, typename Op>
void binary_op(
const array& a,
const array& b,
std::vector<array>& outputs,
Op op) {
DefaultScalarVector<T, T, Op> opsv(op);
DefaultVectorScalar<T, T, Op> opvs(op);
DefaultVectorVector<T, T, Op> opvv(op);
binary_op<T, T>(a, b, outputs[0], outputs[1], op, opsv, opvs, opvv);
}
template <typename... Ops>
void binary(
const array& a,
const array& b,
std::vector<array>& outputs,
Ops... ops) {
switch (outputs[0].dtype()) {
case bool_:
binary_op<bool>(a, b, outputs, ops...);
break;
case uint8:
binary_op<uint8_t>(a, b, outputs, ops...);
break;
case uint16:
binary_op<uint16_t>(a, b, outputs, ops...);
break;
case uint32:
binary_op<uint32_t>(a, b, outputs, ops...);
break;
case uint64:
binary_op<uint64_t>(a, b, outputs, ops...);
break;
case int8:
binary_op<int8_t>(a, b, outputs, ops...);
break;
case int16:
binary_op<int16_t>(a, b, outputs, ops...);
break;
case int32:
binary_op<int32_t>(a, b, outputs, ops...);
break;
case int64:
binary_op<int64_t>(a, b, outputs, ops...);
break;
case float16:
binary_op<float16_t>(a, b, outputs, ops...);
break;
case float32:
binary_op<float>(a, b, outputs, ops...);
break;
case bfloat16:
binary_op<bfloat16_t>(a, b, outputs, ops...);
break;
case complex64:
binary_op<complex64_t>(a, b, outputs, ops...);
break;
}
}
} // namespace
} // namespace mlx::core
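The two-output binary_op above expects element-wise ops that return a pair. A hedged, standalone sketch of the kind of ops a DivMod-style caller (see the dtype switch at the top of this diff) might pass in; the exact lambdas live in the caller and are not shown in this hunk:
#include <cassert>
#include <cmath>
#include <utility>

int main() {
  // Quotient first, remainder second, matching dst.first / dst.second above.
  auto integral_op = [](auto x, auto y) { return std::make_pair(x / y, x % y); };
  auto float_op = [](auto x, auto y) {
    return std::make_pair(std::trunc(x / y), std::fmod(x, y));
  };
  assert(integral_op(7, 2) == std::make_pair(3, 1));
  auto [q, r] = float_op(7.5f, 2.0f);
  assert(q == 3.0f && r == 1.5f);
  return 0;
}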

View File

@@ -0,0 +1,59 @@
// Copyright © 2023-2024 Apple Inc.
#include <queue>
#include "mlx/primitives.h"
namespace mlx::core {
// Build the real tape
std::pair<std::queue<array>, std::vector<array>> trace_to_real(
const std::vector<array>& trace_tape,
const std::vector<array>& trace_inputs,
const std::vector<array>& trace_outputs,
const std::vector<array>& inputs) {
std::unordered_map<uintptr_t, array> trace_to_real;
for (int i = 0; i < inputs.size(); ++i) {
trace_to_real.insert({trace_inputs[i].id(), inputs[i]});
}
std::queue<array> tape;
for (auto& a : trace_tape) {
// Find real inputs
std::vector<array> real_inputs;
for (auto& in : a.inputs()) {
real_inputs.push_back(trace_to_real.at(in.id()));
}
tape.push(
array(a.shape(), a.dtype(), a.primitive_ptr(), std::move(real_inputs)));
trace_to_real.insert({a.id(), tape.back()});
}
std::vector<array> outputs;
for (auto& o : trace_outputs) {
outputs.push_back(trace_to_real.at(o.id()));
}
return {tape, outputs};
}
void Compiled::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
// Make a real tape from the tracers
auto [tape, real_outputs] = trace_to_real(tape_, inputs_, outputs_, inputs);
// Run the tape
while (!tape.empty()) {
auto a = std::move(tape.front());
tape.pop();
auto outputs = a.outputs();
a.primitive().eval_cpu(a.inputs(), outputs);
a.detach();
}
// Copy results into outputs
for (int o = 0; o < real_outputs.size(); ++o) {
outputs[o].copy_shared_buffer(real_outputs[o]);
}
}
} // namespace mlx::core

View File

@@ -3,7 +3,7 @@
#include <cassert>
#ifdef ACCELERATE_NEW_LAPACK
#include <vecLib/cblas_new.h>
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif

View File

@@ -289,11 +289,16 @@ void copy(const array& src, array& dst, CopyType ctype) {
// Allocate the output
switch (ctype) {
case CopyType::Vector:
dst.set_data(
allocator::malloc_or_wait(src.data_size() * dst.itemsize()),
src.data_size(),
src.strides(),
src.flags());
if (src.is_donatable() && src.itemsize() == dst.itemsize()) {
dst.copy_shared_buffer(src);
} else {
auto size = src.data_size();
dst.set_data(
allocator::malloc_or_wait(size * dst.itemsize()),
size,
src.strides(),
src.flags());
}
break;
case CopyType::Scalar:
case CopyType::General:

View File

@@ -1,11 +1,13 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#ifdef ACCELERATE_NEW_LAPACK
#include <vecLib/cblas_new.h>
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif
#include <cstring>
#include "mlx/array.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
@@ -16,6 +18,12 @@
primitive::eval(inputs, out); \
}
#define DEFAULT_MULTI(primitive) \
void primitive::eval_cpu( \
const std::vector<array>& inputs, std::vector<array>& outputs) { \
primitive::eval(inputs, outputs); \
}
namespace mlx::core {
DEFAULT(Abs)
@@ -33,12 +41,16 @@ DEFAULT(ArgSort)
DEFAULT(AsType)
DEFAULT(AsStrided)
DEFAULT(Broadcast)
DEFAULT_MULTI(DivMod)
DEFAULT(Ceil)
DEFAULT_MULTI(Compiled)
DEFAULT(Concatenate)
DEFAULT(Convolution)
DEFAULT(Copy)
DEFAULT(Cos)
DEFAULT(Cosh)
DEFAULT_MULTI(CustomVJP)
DEFAULT_MULTI(Depends)
DEFAULT(Divide)
DEFAULT(Remainder)
DEFAULT(Equal)
@@ -57,6 +69,8 @@ DEFAULT(Load)
DEFAULT(Log)
DEFAULT(Log1p)
DEFAULT(LogicalNot)
DEFAULT(LogicalAnd)
DEFAULT(LogicalOr)
DEFAULT(LogAddExp)
DEFAULT(Maximum)
DEFAULT(Minimum)
@@ -66,6 +80,7 @@ DEFAULT(NotEqual)
DEFAULT(Pad)
DEFAULT(Partition)
DEFAULT(Power)
DEFAULT_MULTI(QRF)
DEFAULT(QuantizedMatmul)
DEFAULT(RandomBits)
DEFAULT(Reduce)
@@ -80,6 +95,7 @@ DEFAULT(Sinh)
DEFAULT(Slice)
DEFAULT(Softmax)
DEFAULT(Sort)
DEFAULT_MULTI(Split)
DEFAULT(Square)
DEFAULT(Sqrt)
DEFAULT(StopGradient)
@@ -88,16 +104,14 @@ DEFAULT(Tan)
DEFAULT(Tanh)
DEFAULT(Transpose)
void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[Matmul::eval_cpu] Currently only supports float32.");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto& a_pre = inputs[0];
auto& b_pre = inputs[1];
namespace {
inline void matmul_common_general(
const array& a_pre,
const array& b_pre,
array& out,
float alpha = 1.0f,
float beta = 0.0f) {
auto check_transpose = [](const array& arr) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
@@ -115,9 +129,17 @@ void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
int M = a.shape(-2);
int N = b.shape(-1);
int K = a.shape(-1);
size_t M = a.shape(-2);
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (M == 0 || N == 0) {
return;
}
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
for (int i = 0; i < (a.size() / (M * K)); ++i) {
cblas_sgemm(
CblasRowMajor,
@@ -126,16 +148,41 @@ void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
M,
N,
K,
1.0f, // alpha
alpha, // alpha
a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
lda,
b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
ldb,
0.0f, // beta
beta, // beta
out.data<float>() + M * N * i,
out.shape(-1) // ldc
);
}
}
} // namespace
void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[Matmul::eval_cpu] Currently only supports float32.");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
return matmul_common_general(inputs[0], inputs[1], out);
}
void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[AddMM::eval_cpu] Currently only supports float32.");
}
// Fill output with C
auto& c = inputs[2];
CopyType ctype = c.data_size() == 1 ? CopyType::Scalar : CopyType::General;
copy(c, out, ctype);
return matmul_common_general(inputs[0], inputs[1], out, alpha_, beta_);
}
} // namespace mlx::core
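For reference, the AddMM path above (copy C into the output, then call sgemm with alpha and beta) computes out = alpha * (a @ b) + beta * c. A naive row-major sketch of those semantics for a single un-batched matrix; this is illustrative and not the code path used:
void addmm_reference(
    const float* a,
    const float* b,
    const float* c,
    float* out,
    int M,
    int N,
    int K,
    float alpha,
    float beta) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) {
        acc += a[i * K + k] * b[k * N + j];
      }
      out[i * N + j] = alpha * acc + beta * c[i * N + j];
    }
  }
}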

View File

@@ -5,7 +5,7 @@
#include <utility>
#include "mlx/allocator.h"
#include "mlx/load.h"
#include "mlx/io/load.h"
#include "mlx/primitives.h"
namespace mlx::core {
@@ -13,7 +13,7 @@ namespace mlx::core {
namespace {
template <const uint8_t scalar_size>
void swap_endianess(uint8_t* data_bytes, size_t N) {
void swap_endianness(uint8_t* data_bytes, size_t N) {
struct Elem {
uint8_t bytes[scalar_size];
};
@@ -39,13 +39,13 @@ void Load::eval(const std::vector<array>& inputs, array& out) {
if (swap_endianness_) {
switch (out.itemsize()) {
case 2:
swap_endianess<2>(out.data<uint8_t>(), out.data_size());
swap_endianness<2>(out.data<uint8_t>(), out.data_size());
break;
case 4:
swap_endianess<4>(out.data<uint8_t>(), out.data_size());
swap_endianness<4>(out.data<uint8_t>(), out.data_size());
break;
case 8:
swap_endianess<8>(out.data<uint8_t>(), out.data_size());
swap_endianness<8>(out.data<uint8_t>(), out.data_size());
break;
}
}
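A standalone sketch of what the renamed swap_endianness helper does for one 4-byte element (the real helper reverses bytes in place for N elements of scalar_size bytes each):
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t v = 0x11223344u;
  uint8_t bytes[4];
  std::memcpy(bytes, &v, sizeof(v));
  std::reverse(bytes, bytes + 4);  // reverse the bytes of one element
  std::memcpy(&v, bytes, sizeof(v));
  assert(v == 0x44332211u);
  return 0;
}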

View File

@@ -8,6 +8,7 @@
#include "mlx/allocator.h"
#include "mlx/backend/common/arange.h"
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/erf.h"
#include "mlx/backend/common/threefry.h"
@@ -231,22 +232,38 @@ void Cosh::eval(const std::vector<array>& inputs, array& out) {
}
}
void CustomVJP::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
i++, j++) {
outputs[i].copy_shared_buffer(inputs[j]);
}
}
void Depends::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() > outputs.size());
for (int i = 0; i < outputs.size(); i++) {
outputs[i].copy_shared_buffer(inputs[i]);
}
}
void Erf::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
switch (out.dtype()) {
case float32:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<float>(in, out, [](auto x) { return std::erf(x); });
break;
case float16:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<float16_t>(in, out, [](auto x) {
return static_cast<float16_t>(std::erf(static_cast<float>(x)));
});
break;
case bfloat16:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<bfloat16_t>(in, out, [](auto x) {
return static_cast<bfloat16_t>(std::erf(static_cast<float>(x)));
});
@@ -263,17 +280,14 @@ void ErfInv::eval(const std::vector<array>& inputs, array& out) {
const auto& in = inputs[0];
switch (out.dtype()) {
case float32:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<float>(in, out, [](auto x) { return erfinv(x); });
break;
case float16:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<float16_t>(in, out, [](auto x) {
return static_cast<float16_t>(erfinv(static_cast<float>(x)));
});
break;
case bfloat16:
out.set_data(allocator::malloc_or_wait(out.nbytes()));
unary_op<bfloat16_t>(in, out, [](auto x) {
return static_cast<bfloat16_t>(erfinv(static_cast<float>(x)));
});
@@ -364,6 +378,20 @@ void LogicalNot::eval(const std::vector<array>& inputs, array& out) {
unary(in, out, [](auto x) { return !x; });
}
void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalAnd requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, [](auto x, auto y) { return x && y; });
}
void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalOr requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, [](auto x, auto y) { return x || y; });
}
void Negative::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
@@ -573,6 +601,58 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
out.copy_shared_buffer(in, strides, flags, data_size, data_offset);
}
void Split::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 1);
auto& in = inputs[0];
auto compute_new_flags = [](const auto& shape,
const auto& strides,
size_t in_data_size,
auto flags) {
size_t data_size = 1;
size_t f_stride = 1;
size_t b_stride = 1;
flags.row_contiguous = true;
flags.col_contiguous = true;
for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
flags.col_contiguous &= strides[i] == f_stride || shape[i] == 1;
flags.row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
f_stride *= shape[i];
b_stride *= shape[ri];
if (strides[i] > 0) {
data_size *= shape[i];
}
}
if (data_size == 1) {
// Broadcasted scalar array is contiguous.
flags.contiguous = true;
} else if (data_size == in_data_size) {
// Means we sliced a broadcasted dimension so leave the "no holes" flag
// alone.
} else {
// We sliced something. So either we are row or col contiguous or we
// punched a hole.
flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
}
return std::pair<decltype(flags), size_t>{flags, data_size};
};
std::vector<int> indices(1, 0);
indices.insert(indices.end(), indices_.begin(), indices_.end());
for (int i = 0; i < indices.size(); i++) {
size_t offset = indices[i] * in.strides()[axis_];
auto [new_flags, data_size] = compute_new_flags(
outputs[i].shape(), in.strides(), in.data_size(), in.flags());
outputs[i].copy_shared_buffer(
in, in.strides(), new_flags, data_size, offset);
}
}
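A small, hypothetical illustration of the offset arithmetic above: splitting a row-contiguous (4, 6) input (strides {6, 1}) at indices {2, 5} along axis 1 produces views into the shared buffer at element offsets 0, 2 and 5:
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<size_t> strides = {6, 1};  // row-contiguous (4, 6) input
  int axis = 1;
  std::vector<int> indices = {0, 2, 5};  // leading 0 prepended as in the code
  std::vector<size_t> offsets;
  for (int idx : indices) {
    offsets.push_back(idx * strides[axis]);
  }
  assert((offsets == std::vector<size_t>{0, 2, 5}));
  return 0;
}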
void Square::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];

153
mlx/backend/common/qrf.cpp Normal file
View File

@@ -0,0 +1,153 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/primitives.h"
#ifdef ACCELERATE_NEW_LAPACK
#include <Accelerate/Accelerate.h>
#else
#include <lapack.h>
#endif
namespace mlx::core {
template <typename T>
struct lpack;
template <>
struct lpack<float> {
static void xgeqrf(
const int* m,
const int* n,
float* a,
const int* lda,
float* tau,
float* work,
const int* lwork,
int* info) {
sgeqrf_(m, n, a, lda, tau, work, lwork, info);
}
static void xorgqr(
const int* m,
const int* n,
const int* k,
float* a,
const int* lda,
const float* tau,
float* work,
const int* lwork,
int* info) {
sorgqr_(m, n, k, a, lda, tau, work, lwork, info);
}
};
template <typename T>
void qrf_impl(const array& a, array& q, array& r) {
const int M = a.shape(-2);
const int N = a.shape(-1);
const int lda = std::max(M, N);
size_t num_matrices = a.size() / (M * N);
int num_reflectors = std::min(M, N);
auto tau =
allocator::malloc_or_wait(sizeof(T) * num_matrices * num_reflectors);
// Copy A to an in-place input and make it col-contiguous
array in(a.shape(), float32, nullptr, {});
auto flags = in.flags();
// Copy the input to be column contiguous
flags.col_contiguous = num_matrices == 1;
flags.row_contiguous = false;
std::vector<size_t> strides = in.strides();
strides[in.ndim() - 2] = 1;
strides[in.ndim() - 1] = M;
in.set_data(
allocator::malloc_or_wait(in.nbytes()), in.nbytes(), strides, flags);
copy_inplace(a, in, CopyType::GeneralGeneral);
T optimal_work;
int lwork = -1;
int info;
// Compute workspace size
lpack<T>::xgeqrf(
&M, &N, nullptr, &lda, nullptr, &optimal_work, &lwork, &info);
// Update workspace size
lwork = optimal_work;
auto work = allocator::malloc_or_wait(sizeof(T) * lwork);
// Loop over matrices
for (int i = 0; i < num_matrices; ++i) {
// Solve
lpack<T>::xgeqrf(
&M,
&N,
in.data<float>() + M * N * i,
&lda,
static_cast<T*>(tau.raw_ptr()) + num_reflectors * i,
static_cast<T*>(work.raw_ptr()),
&lwork,
&info);
}
allocator::free(work);
r.set_data(allocator::malloc_or_wait(r.nbytes()));
copy_inplace(in, r, CopyType::General);
for (int i = 0; i < num_matrices; ++i) {
// Zero lower triangle
for (int j = 0; j < r.shape(-2); ++j) {
for (int k = 0; k < j; ++k) {
r.data<T>()[i * N * M + j * N + k] = 0;
}
}
}
// Get work size
lwork = -1;
lpack<T>::xorgqr(
&M,
&N,
&num_reflectors,
nullptr,
&lda,
nullptr,
&optimal_work,
&lwork,
&info);
lwork = optimal_work;
work = allocator::malloc_or_wait(sizeof(T) * lwork);
// Loop over matrices
for (int i = 0; i < num_matrices; ++i) {
// Compute Q
lpack<T>::xorgqr(
&M,
&N,
&num_reflectors,
in.data<float>() + M * N * i,
&lda,
static_cast<T*>(tau.raw_ptr()) + num_reflectors * i,
static_cast<T*>(work.raw_ptr()),
&lwork,
&info);
}
q.set_data(allocator::malloc_or_wait(q.nbytes()));
copy_inplace(in, q, CopyType::General);
// Cleanup
allocator::free(work);
allocator::free(tau);
}
void QRF::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
if (!(inputs[0].dtype() == float32)) {
throw std::runtime_error("[QRF::eval] only supports float32.");
}
qrf_impl<float>(inputs[0], outputs[0], outputs[1]);
}
} // namespace mlx::core
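The paired xgeqrf/xorgqr calls with lwork = -1 follow the standard LAPACK workspace-query convention: ask for the optimal size first, then allocate and run the real call. A compact standalone sketch of that pattern, assuming the same Fortran sgeqrf_ symbol the lpack<float> wrapper above binds to:
#include <vector>

extern "C" void sgeqrf_(
    const int* m,
    const int* n,
    float* a,
    const int* lda,
    float* tau,
    float* work,
    const int* lwork,
    int* info);

void qr_factor_inplace(float* a, int M, int N, float* tau) {
  int lda = M;  // column-major leading dimension for this sketch
  int info = 0;
  float query = 0.0f;
  int lwork = -1;  // -1 requests a workspace-size query only
  sgeqrf_(&M, &N, a, &lda, tau, &query, &lwork, &info);
  lwork = static_cast<int>(query);  // optimal size is returned in work[0]
  std::vector<float> work(lwork);
  sgeqrf_(&M, &N, a, &lda, tau, work.data(), &lwork, &info);  // real call
}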

View File

@@ -2,12 +2,60 @@
#include <cassert>
#include "mlx/backend/metal/copy.h"
#include "mlx/primitives.h"
namespace mlx::core {
namespace {
template <typename T, int bits, int group_size>
void _qmm(
T* result,
const T* x,
const uint32_t* w,
const T* scales,
const T* biases,
int M,
int N,
int K) {
constexpr int bitmask = (1 << bits) - 1;
constexpr int pack_factor = 32 / bits;
constexpr int packs_in_group = group_size / pack_factor;
const int Ng = N / group_size;
const int Nw = N / pack_factor;
for (int m = 0; m < M; m++) {
const uint32_t* w_local = w;
const T* scales_local = scales;
const T* biases_local = biases;
std::fill(result, result + N, 0);
for (int k = 0; k < K; k++) {
T* result_local = result;
T xi = *x++;
for (int n = 0; n < N; n += group_size) {
T scale = *scales_local++;
T bias = *biases_local++;
for (int ng = 0; ng < packs_in_group; ng++) {
uint32_t wi = *w_local++;
#pragma clang loop unroll(full)
for (int p = 0; p < pack_factor; p++) {
(*result_local++) +=
xi * (scale * static_cast<T>(wi & bitmask) + bias);
wi >>= bits;
}
}
}
}
result += N;
}
}
template <typename T, int bits, int group_size>
void _qmm_t(
T* result,
@@ -55,7 +103,7 @@ void _qmm_t(
}
template <typename T>
void _qmm_t_dispatch_typed(
void _qmm_dispatch_typed(
T* result,
const T* x,
const uint32_t* w,
@@ -65,30 +113,73 @@ void _qmm_t_dispatch_typed(
int N,
int K,
int group_size,
int bits) {
int bits,
bool transposed_w) {
switch (bits) {
case 2: {
switch (group_size) {
case 32:
if (transposed_w) {
return _qmm_t<T, 2, 32>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 2, 32>(result, x, w, scales, biases, M, N, K);
}
case 64:
return _qmm_t<T, 2, 64>(result, x, w, scales, biases, M, N, K);
if (transposed_w) {
return _qmm_t<T, 2, 64>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 2, 64>(result, x, w, scales, biases, M, N, K);
}
case 128:
return _qmm_t<T, 2, 128>(result, x, w, scales, biases, M, N, K);
if (transposed_w) {
return _qmm_t<T, 2, 128>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 2, 128>(result, x, w, scales, biases, M, N, K);
}
}
}
case 4: {
switch (group_size) {
case 32:
if (transposed_w) {
return _qmm_t<T, 4, 32>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 4, 32>(result, x, w, scales, biases, M, N, K);
}
case 64:
return _qmm_t<T, 4, 64>(result, x, w, scales, biases, M, N, K);
if (transposed_w) {
return _qmm_t<T, 4, 64>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 4, 64>(result, x, w, scales, biases, M, N, K);
}
case 128:
return _qmm_t<T, 4, 128>(result, x, w, scales, biases, M, N, K);
if (transposed_w) {
return _qmm_t<T, 4, 128>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 4, 128>(result, x, w, scales, biases, M, N, K);
}
}
}
case 8: {
switch (group_size) {
case 32:
if (transposed_w) {
return _qmm_t<T, 8, 32>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 8, 32>(result, x, w, scales, biases, M, N, K);
}
case 64:
return _qmm_t<T, 8, 64>(result, x, w, scales, biases, M, N, K);
if (transposed_w) {
return _qmm_t<T, 8, 64>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 8, 64>(result, x, w, scales, biases, M, N, K);
}
case 128:
return _qmm_t<T, 8, 128>(result, x, w, scales, biases, M, N, K);
if (transposed_w) {
return _qmm_t<T, 8, 128>(result, x, w, scales, biases, M, N, K);
} else {
return _qmm<T, 8, 128>(result, x, w, scales, biases, M, N, K);
}
}
}
}
@@ -100,21 +191,22 @@ void _qmm_t_dispatch_typed(
throw std::invalid_argument(msg.str());
}
void _qmm_t_dispatch(
void _qmm_dispatch(
array out,
const array& x,
const array& w,
const array& scales,
const array& biases,
int bits,
int group_size) {
int group_size,
bool transposed_w) {
int K = x.shape(-1);
int M = x.size() / K;
int N = w.shape(1);
int N = out.shape(-1);
switch (x.dtype()) {
case float32:
_qmm_t_dispatch_typed<float>(
_qmm_dispatch_typed<float>(
out.data<float>(),
x.data<float>(),
w.data<uint32_t>(),
@@ -124,10 +216,11 @@ void _qmm_t_dispatch(
N,
K,
bits,
group_size);
group_size,
transposed_w);
break;
case float16:
_qmm_t_dispatch_typed<float16_t>(
_qmm_dispatch_typed<float16_t>(
out.data<float16_t>(),
x.data<float16_t>(),
w.data<uint32_t>(),
@@ -137,10 +230,11 @@ void _qmm_t_dispatch(
N,
K,
bits,
group_size);
group_size,
transposed_w);
break;
case bfloat16:
_qmm_t_dispatch_typed<bfloat16_t>(
_qmm_dispatch_typed<bfloat16_t>(
out.data<bfloat16_t>(),
x.data<bfloat16_t>(),
w.data<uint32_t>(),
@@ -150,7 +244,8 @@ void _qmm_t_dispatch(
N,
K,
bits,
group_size);
group_size,
transposed_w);
break;
default:
throw std::invalid_argument(
@@ -163,22 +258,28 @@ void _qmm_t_dispatch(
void QuantizedMatmul::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 4);
auto& x = inputs[0];
auto& w = inputs[1];
auto& scales = inputs[2];
auto& biases = inputs[3];
auto& x_pre = inputs[0];
auto& w_pre = inputs[1];
auto& scales_pre = inputs[2];
auto& biases_pre = inputs[3];
if (w.strides()[0] != 1) {
throw std::runtime_error("The quantized weight should be transposed");
}
auto ensure_row_contiguous = [](const array& arr) {
if (arr.flags().row_contiguous) {
return arr;
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::General);
return arr_copy;
}
};
if (!x.flags().row_contiguous || !scales.flags().row_contiguous ||
!biases.flags().row_contiguous) {
throw std::runtime_error("x, scales and biases should be row contiguous.");
}
auto x = ensure_row_contiguous(x_pre);
auto w = ensure_row_contiguous(w_pre);
auto scales = ensure_row_contiguous(scales_pre);
auto biases = ensure_row_contiguous(biases_pre);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
_qmm_t_dispatch(out, x, w, scales, biases, group_size_, bits_);
_qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
}
} // namespace mlx::core
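The inner loop of _qmm dequantizes packed weights on the fly. A standalone sketch of that unpacking arithmetic with made-up values (the packed word, scale and bias below are illustrative):
#include <cassert>
#include <cstdint>

int main() {
  constexpr int bits = 4;
  constexpr int pack_factor = 32 / bits;         // 8 elements per uint32
  constexpr uint32_t bitmask = (1 << bits) - 1;  // 0xF
  uint32_t wi = 0x87654321u;                     // packed weights, low bits first
  float scale = 0.5f;
  float bias = -1.0f;
  float w[pack_factor];
  for (int p = 0; p < pack_factor; ++p) {
    w[p] = scale * static_cast<float>(wi & bitmask) + bias;
    wi >>= bits;
  }
  assert(w[0] == 0.5f * 1.0f - 1.0f);  // lowest nibble is 0x1
  assert(w[7] == 0.5f * 8.0f - 1.0f);  // highest nibble is 0x8
  return 0;
}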

View File

@@ -126,7 +126,7 @@ struct ReductionPlan {
ReductionPlan get_reduction_plan(const array& x, const std::vector<int> axes) {
// The data is all there and we are reducing over everything
if (x.size() == x.data_size() && axes.size() == x.ndim() &&
(x.flags().row_contiguous || x.flags().col_contiguous)) {
x.flags().contiguous) {
return ContiguousAllReduce;
}

View File

@@ -56,23 +56,32 @@ struct SignOp {
struct RoundOp {
template <typename T>
T operator()(T x) {
return std::round(x);
return std::rint(x);
}
complex64_t operator()(complex64_t x) {
return {std::round(x.real()), std::round(x.imag())};
return {std::rint(x.real()), std::rint(x.imag())};
}
};
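The switch from std::round to std::rint changes how halfway cases are handled: rint honours the current rounding mode (round-half-to-even by default), while round always rounds halves away from zero. A quick standalone check:
#include <cassert>
#include <cmath>

int main() {
  assert(std::round(2.5f) == 3.0f);  // away from zero
  assert(std::rint(2.5f) == 2.0f);   // to nearest even under FE_TONEAREST
  assert(std::rint(3.5f) == 4.0f);
  return 0;
}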
void set_unary_output_data(const array& in, array& out) {
if (in.is_donatable() && in.itemsize() == out.itemsize()) {
out.copy_shared_buffer(in);
} else {
auto size = in.data_size();
out.set_data(
allocator::malloc_or_wait(size * out.itemsize()),
size,
in.strides(),
in.flags());
}
}
template <typename T, typename Op>
void unary_op(const array& a, array& out, Op op) {
const T* a_ptr = a.data<T>();
if (a.flags().contiguous) {
out.set_data(
allocator::malloc_or_wait(a.data_size() * out.itemsize()),
a.data_size(),
a.strides(),
a.flags());
set_unary_output_data(a, out);
T* dst = out.data<T>();
for (size_t i = 0; i < a.data_size(); ++i) {
dst[i] = op(a_ptr[i]);

View File

@@ -1,7 +1,28 @@
add_custom_command(
OUTPUT compiled_preamble.cpp
COMMAND /bin/bash
${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
${CMAKE_C_COMPILER}
${CMAKE_SOURCE_DIR}
DEPENDS make_compiled_preamble.sh
kernels/compiled_preamble.h
kernels/unary.h
kernels/binary.h
)
add_custom_target(
compiled_preamble
DEPENDS compiled_preamble.cpp
)
add_dependencies(mlx compiled_preamble)
target_sources(
mlx
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
@@ -15,6 +36,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
)
if (NOT MLX_METAL_PATH)

View File

@@ -23,16 +23,23 @@ void* Buffer::raw_ptr() {
namespace metal {
static bool cache_enabled_ = true;
bool cache_enabled() {
return cache_enabled_;
}
void set_cache_enabled(bool enabled) {
cache_enabled_ = enabled;
}
namespace {
BufferCache::BufferCache(MTL::Device* device)
: device_(device),
head_(nullptr),
tail_(nullptr),
pool_size_(0),
gc_limit_(0.95 * device_->recommendedMaxWorkingSetSize()) {}
: device_(device), head_(nullptr), tail_(nullptr), pool_size_(0) {}
BufferCache::~BufferCache() {
auto thread_pool = metal::new_scoped_memory_pool();
clear();
}
@@ -54,12 +61,16 @@ MTL::Buffer* BufferCache::reuse_from_cache(size_t size) {
// Find the closest buffer in pool
MTL::Buffer* pbuf = nullptr;
// Make sure we use most of the available memory
auto it = buffer_pool_.lower_bound(size);
// Make sure we use > 50% of the available memory
while (!pbuf && it != buffer_pool_.end() && it->first < 2 * size) {
// Make sure we use most of the available memory
while (!pbuf && it != buffer_pool_.end() &&
it->first < std::min(2 * size, size + 2 * vm_page_size)) {
// Collect from the cache
pbuf = it->second->buf;
// Remove from cache
remove_from_list(it->second);
delete it->second;
@@ -85,13 +96,9 @@ void BufferCache::recycle_to_cache(MTL::Buffer* buf) {
}
}
size_t BufferCache::release_cached_buffers(size_t min_bytes_to_free) {
min_bytes_to_free += device_->currentAllocatedSize() - gc_limit_;
void BufferCache::release_cached_buffers(size_t min_bytes_to_free) {
if (min_bytes_to_free >= 0.9 * pool_size_) {
size_t old_pool_size = pool_size_;
clear();
return old_pool_size;
} else {
std::lock_guard<std::mutex> lk(cache_mutex_);
size_t total_bytes_freed = 0;
@@ -104,9 +111,7 @@ size_t BufferCache::release_cached_buffers(size_t min_bytes_to_free) {
}
remove_from_list(tail_);
}
pool_size_ -= total_bytes_freed;
return total_bytes_freed;
}
}
@@ -125,8 +130,9 @@ void BufferCache::add_at_head(BufferCache::BufferHolder* to_add) {
}
void BufferCache::remove_from_list(BufferCache::BufferHolder* to_remove) {
if (!to_remove)
if (!to_remove) {
return;
}
// If in the middle
if (to_remove->prev && to_remove->next) {
@@ -153,26 +159,37 @@ MetalAllocator::MetalAllocator()
: device_(device(mlx::core::Device::gpu).mtl_device()),
buffer_cache_(device_),
peak_allocated_size_(0),
block_limit_(1.5 * device_->recommendedMaxWorkingSetSize()) {}
block_limit_(1.5 * device_->recommendedMaxWorkingSetSize()),
gc_limit_(0.95 * device_->recommendedMaxWorkingSetSize()) {}
Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
// Metal doesn't like empty buffers
if (size == 0) {
return Buffer{nullptr};
}
Buffer MetalAllocator::malloc(size_t size) {
// Align up memory
if (size > vm_page_size) {
size = vm_page_size * ((size + vm_page_size - 1) / vm_page_size);
}
// Try the cache
MTL::Buffer* buf = buffer_cache_.reuse_from_cache(size);
// Prepare to allocate new memory as needed
if (!buf) {
// If we are under very high memory pressure, we don't allocate further
if (device_->currentAllocatedSize() >= block_limit_) {
// If there is too much memory pressure, fail (likely causes a wait).
if (!allow_swap && device_->currentAllocatedSize() + size >= block_limit_) {
return Buffer{nullptr};
}
// If we are still under memory pressure, try cleaning cache
if (buffer_cache_.can_garbage_collect()) {
buffer_cache_.release_cached_buffers(size);
auto thread_pool = metal::new_scoped_memory_pool();
// If we have a lot of memory pressure, check if we can reclaim some memory
// from the cache
if (device_->currentAllocatedSize() + size >= gc_limit_) {
size_t min_bytes_to_free =
size + device_->currentAllocatedSize() - gc_limit_;
buffer_cache_.release_cached_buffers(min_bytes_to_free);
}
// Allocate new buffer if needed
@@ -189,7 +206,11 @@ Buffer MetalAllocator::malloc(size_t size) {
void MetalAllocator::free(Buffer buffer) {
auto buf = static_cast<MTL::Buffer*>(buffer.ptr());
buffer_cache_.recycle_to_cache(buf);
if (cache_enabled()) {
buffer_cache_.recycle_to_cache(buf);
} else {
buf->release();
}
}
MetalAllocator& allocator() {

View File

@@ -23,11 +23,7 @@ class BufferCache {
MTL::Buffer* reuse_from_cache(size_t size);
void recycle_to_cache(MTL::Buffer* buf);
size_t release_cached_buffers(size_t min_bytes_to_free);
bool can_garbage_collect() {
return pool_size_ > 0 && device_->currentAllocatedSize() > gc_limit_;
}
void release_cached_buffers(size_t min_bytes_to_free);
private:
struct BufferHolder {
@@ -49,7 +45,6 @@ class BufferCache {
BufferHolder* head_;
BufferHolder* tail_;
size_t pool_size_;
size_t gc_limit_;
};
} // namespace
@@ -57,7 +52,7 @@ class BufferCache {
class MetalAllocator : public allocator::Allocator {
/** Allocator for Metal GPUs. */
public:
virtual Buffer malloc(size_t size) override;
virtual Buffer malloc(size_t size, bool allow_swap = false) override;
virtual void free(Buffer buffer) override;
private:
@@ -71,6 +66,7 @@ class MetalAllocator : public allocator::Allocator {
// Allocation stats
size_t peak_allocated_size_;
size_t block_limit_;
size_t gc_limit_;
};
MetalAllocator& allocator();

View File

@@ -0,0 +1,484 @@
// Copyright © 2023-2024 Apple Inc.
#include <sstream>
#include "mlx/backend/metal/compiled_preamble.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/graph_utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
namespace mlx::core {
inline bool is_static_cast(const Primitive& p) {
return (
typeid(p) == typeid(Broadcast) || typeid(p) == typeid(Copy) ||
typeid(p) == typeid(StopGradient) || typeid(p) == typeid(AsType));
}
inline auto get_type_string(Dtype d) {
switch (d) {
case float32:
return "float";
case float16:
return "half";
case bfloat16:
return "bfloat16_t";
case bool_:
return "bool";
case int8:
return "int8_t";
case int16:
return "int16_t";
case int32:
return "int32_t";
case int64:
return "int64_t";
case uint8:
return "uint8_t";
case uint16:
return "uint16_t";
case uint32:
return "uint32_t";
case uint64:
return "uint64_t";
default: {
std::ostringstream msg;
msg << "Unsupported compilation type " << d;
throw std::runtime_error(msg.str());
}
}
}
template <typename T>
void print_float_constant(std::ostream& os, const array& x) {
auto old_precision = os.precision();
os << std::setprecision(std::numeric_limits<float>::digits10 + 1)
<< x.item<T>() << std::setprecision(old_precision);
}
template <typename T>
void print_int_constant(std::ostream& os, const array& x) {
os << x.item<T>();
}
void print_constant(std::ostream& os, const array& x) {
switch (x.dtype()) {
case float32:
return print_float_constant<float>(os, x);
case float16:
return print_float_constant<float16_t>(os, x);
case bfloat16:
return print_float_constant<bfloat16_t>(os, x);
case int8:
return print_int_constant<int8_t>(os, x);
case int16:
return print_int_constant<int16_t>(os, x);
case int32:
return print_int_constant<int32_t>(os, x);
case int64:
return print_int_constant<int64_t>(os, x);
case uint8:
return print_int_constant<uint8_t>(os, x);
case uint16:
return print_int_constant<uint16_t>(os, x);
case uint32:
return print_int_constant<uint32_t>(os, x);
case uint64:
return print_int_constant<uint64_t>(os, x);
case bool_:
os << std::boolalpha << x.item<bool>();
return;
default:
throw std::runtime_error("Unsupported constant type");
}
}
inline std::string build_lib_name(
const std::vector<array>& inputs,
const std::vector<array>& outputs,
const std::vector<array>& tape,
const std::unordered_set<uintptr_t>& constant_ids) {
std::ostringstream os;
std::ostringstream constant_hasher;
// The primitives describing the tape. For unary and binary primitives this
// must be enough to describe the full computation.
for (auto& a : tape) {
a.primitive().print(os);
}
os << "_";
for (auto& x : inputs) {
if (constant_ids.find(x.id()) != constant_ids.end()) {
os << "C";
print_constant(constant_hasher, x);
} else {
os << ((x.size() == 1) ? "S" : "V");
}
}
os << "_";
for (auto& x : inputs) {
if (constant_ids.find(x.id()) != constant_ids.end()) {
continue;
}
os << kindof(x.dtype()) << x.itemsize();
}
os << "_" << std::hash<std::string>{}(constant_hasher.str());
return os.str();
}
inline void build_kernel(
std::ostream& os,
const std::string& kernel_name,
const std::vector<array>& inputs,
const std::vector<array>& outputs,
const std::vector<array>& tape,
const std::unordered_set<uintptr_t>& constant_ids,
bool contiguous,
int ndim,
bool dynamic_dims) {
// All outputs should have the exact same shape and will be row contiguous
auto output_shape = outputs[0].shape();
auto output_strides = outputs[0].strides();
// Constants are scalars that are captured by value and cannot change
auto is_constant = [&constant_ids](const array& x) {
return constant_ids.find(x.id()) != constant_ids.end();
};
// For scalars we shouldn't do the index math, just read at 0
auto is_scalar = [](const array& x) { return x.size() == 1; };
NodeNamer namer;
bool add_indices = false;
int cnt = 0;
// Start the kernel
os << "[[host_name(\"" << kernel_name << "\")]]" << std::endl
<< "[[kernel]] void " << kernel_name << "(" << std::endl;
// Add the input arguments
for (auto& x : inputs) {
auto& xname = namer.get_name(x);
// Skip constants from the input list
if (is_constant(x)) {
continue;
}
// Scalars and contiguous need no strides
if (is_scalar(x) || contiguous) {
os << " device const " << get_type_string(x.dtype()) << "* " << xname
<< " [[buffer(" << cnt++ << ")]]," << std::endl;
} else {
add_indices = true;
os << " device const " << get_type_string(x.dtype()) << "* " << xname
<< " [[buffer(" << cnt++ << ")]]," << std::endl
<< " constant const size_t* " << xname << "_strides [[buffer("
<< cnt++ << ")]]," << std::endl;
}
}
// Add the output arguments
for (auto& x : outputs) {
os << " device " << get_type_string(x.dtype()) << "* "
<< namer.get_name(x) << " [[buffer(" << cnt++ << ")]]," << std::endl;
}
// Add output strides and shape to extract the indices.
if (!contiguous) {
os << " constant const size_t* output_strides [[buffer(" << cnt++
<< ")]]," << std::endl
<< " constant const int* output_shape [[buffer(" << cnt++ << ")]],"
<< std::endl;
}
if (dynamic_dims) {
os << " constant const int& ndim [[buffer(" << cnt++ << ")]],"
<< std::endl;
}
// The thread index in the whole grid
os << " uint3 pos [[thread_position_in_grid]]," << std::endl
<< " uint3 grid [[threads_per_grid]]) {" << std::endl
<< " uint index = pos.x + grid.x * (pos.y + grid.y * pos.z);"
<< std::endl;
// Extract the indices per axis to individual uints if we have arrays that
// are broadcasted or transposed
if (add_indices) {
if (!dynamic_dims) {
if (ndim == 1) {
os << " uint index_0 = pos.x;" << std::endl;
} else if (ndim == 2) {
os << " uint index_0 = pos.y;" << std::endl
<< " uint index_1 = pos.x;" << std::endl;
} else if (ndim == 3) {
os << " uint index_0 = pos.z;" << std::endl
<< " uint index_1 = pos.y;" << std::endl
<< " uint index_2 = pos.x;" << std::endl;
} else {
for (int i = 0; i < ndim - 2; i++) {
os << " uint index_" << i << " = (index / uint(output_strides[" << i
<< "])) % output_shape[" << i << "];" << std::endl;
}
os << " uint index_" << ndim - 2 << " = pos.y;" << std::endl
<< " uint index_" << ndim - 1 << " = pos.x;" << std::endl;
}
}
}
// Read the inputs into tmps
for (auto& x : inputs) {
auto& xname = namer.get_name(x);
if (is_constant(x)) {
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = ";
print_constant(os, x);
os << ";" << std::endl;
} else if (is_scalar(x)) {
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
<< xname << "[0];" << std::endl;
} else if (contiguous) {
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
<< xname << "[index];" << std::endl;
} else if (!dynamic_dims) {
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
<< xname << "[";
os << "index_0 * " << xname << "_strides[0]";
for (int i = 1; i < ndim; i++) {
os << " + index_" << i << " * " << xname << "_strides[" << i << "]";
}
os << "];" << std::endl;
} else {
os << " " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
<< xname << "[elem_to_loc(index, output_shape, " << xname
<< "_strides, ndim)];" << std::endl;
}
}
// Actually write the computation
for (auto& x : tape) {
os << " " << get_type_string(x.dtype()) << " tmp_" << namer.get_name(x)
<< " = ";
if (is_static_cast(x.primitive())) {
os << "static_cast<" << get_type_string(x.dtype()) << ">(tmp_"
<< namer.get_name(x.inputs()[0]) << ");" << std::endl;
} else {
x.primitive().print(os);
os << "()(";
for (int i = 0; i < x.inputs().size() - 1; i++) {
os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
}
os << "tmp_" << namer.get_name(x.inputs().back()) << ");" << std::endl;
}
}
// Write the outputs from tmps
for (auto& x : outputs) {
os << " " << namer.get_name(x) << "[index] = tmp_" << namer.get_name(x)
<< ";" << std::endl;
}
// Finish the kernel
os << "}" << std::endl;
if (cnt > 31) {
std::ostringstream msg;
msg << "[compile] Too many inputs/outputs fused in the Metal Compile "
<< "primitive which exhausted the available argument buffers for "
<< "the kernel. Please file an issue with the function that results "
<< "in this error. The name of the kernel is '" << kernel_name << "'";
throw std::runtime_error(msg.str());
}
}
void Compiled::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
// Make the name for the kernel library
if (kernel_lib_.empty()) {
kernel_lib_ = build_lib_name(inputs_, outputs_, tape_, constant_ids_);
}
// Get the kernel if someone else built it already
auto& s = stream();
auto& d = metal::device(s.device);
auto lib = d.get_library(kernel_lib_);
// If not we have to build it ourselves
if (lib == nullptr) {
std::ostringstream kernel;
kernel << metal::get_kernel_preamble() << std::endl;
build_kernel(
kernel,
kernel_lib_ + "_contiguous",
inputs_,
outputs_,
tape_,
constant_ids_,
/* contiguous = */ true,
/* ndim = */ 0,
/* dynamic_dims = */ false);
for (int i = 1; i < 8; i++) {
build_kernel(
kernel,
kernel_lib_ + "_strided_" + std::to_string(i),
inputs_,
outputs_,
tape_,
constant_ids_,
/* contiguous = */ false,
/* ndim = */ i,
/* dynamic_dims = */ false);
}
build_kernel(
kernel,
kernel_lib_ + "_strided_dynamic",
inputs_,
outputs_,
tape_,
constant_ids_,
/* contiguous = */ false,
/* ndim = */ 0,
/* dynamic_dims = */ true);
kernel_source_ = kernel.str();
lib = d.get_library(kernel_lib_, kernel_source_);
}
// Allocate space for the outputs
for (auto& out : outputs) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
// Figure out which kernel we are using
auto& output_shape = outputs[0].shape();
bool contiguous = true;
for (auto& x : inputs) {
if ((!x.flags().row_contiguous || x.shape() != output_shape) &&
x.size() > 1) {
contiguous = false;
break;
}
}
// Collapse contiguous dims to route to a faster kernel if possible. Also
// handle all broadcasting.
std::vector<std::vector<size_t>> initial_strides;
initial_strides.push_back(outputs[0].strides());
std::vector<int> shape;
std::vector<std::vector<size_t>> strides;
if (!contiguous) {
for (int i = 0; i < inputs.size(); i++) {
// Skip constants.
if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
continue;
}
auto& x = inputs[i];
// Skip scalar inputs.
if (x.size() <= 1) {
continue;
}
// Broadcast the inputs to the output shape.
std::vector<size_t> xstrides;
int j = 0;
for (; j < output_shape.size() - x.ndim(); j++) {
if (output_shape[j] == 1) {
xstrides.push_back(outputs[0].strides()[j]);
} else {
xstrides.push_back(0);
}
}
for (int i = 0; i < x.ndim(); i++, j++) {
if (x.shape(i) == 1) {
if (output_shape[j] == 1) {
xstrides.push_back(outputs[0].strides()[j]);
} else {
xstrides.push_back(0);
}
} else {
xstrides.push_back(x.strides()[i]);
}
}
initial_strides.push_back(std::move(xstrides));
}
std::tie(shape, strides) =
collapse_contiguous_dims(output_shape, initial_strides);
}
// Get the kernel from the lib
int ndim = shape.size();
bool dynamic = ndim >= 8;
auto kernel_name = kernel_lib_ + (contiguous ? "_contiguous" : "_strided_");
if (!contiguous) {
if (dynamic) {
kernel_name += "dynamic";
} else {
kernel_name += std::to_string(shape.size());
}
}
auto kernel = d.get_kernel(kernel_name, lib);
auto compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
// Put the inputs in
int cnt = 0;
int stride_idx = 1; // idx 0 is the output strides
for (int i = 0; i < inputs.size(); i++) {
if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
continue;
}
auto& x = inputs[i];
set_array_buffer(compute_encoder, x, cnt++);
if (!contiguous && x.size() > 1) {
compute_encoder->setBytes(
strides[stride_idx].data(),
strides[stride_idx].size() * sizeof(size_t),
cnt++);
stride_idx++;
}
}
// Put the outputs in
for (auto& x : outputs) {
set_array_buffer(compute_encoder, x, cnt++);
}
// Put the output shape and strides in
if (!contiguous) {
compute_encoder->setBytes(
strides[0].data(), strides[0].size() * sizeof(size_t), cnt++);
compute_encoder->setBytes(shape.data(), shape.size() * sizeof(int), cnt++);
}
// Put the number of dims in if it is dynamic
if (dynamic) {
compute_encoder->setBytes(&ndim, sizeof(int), cnt++);
}
// Launch the kernel
if (contiguous) {
size_t nthreads = outputs[0].size();
MTL::Size grid_dims(nthreads, 1, 1);
MTL::Size group_dims(
std::min(nthreads, kernel->maxTotalThreadsPerThreadgroup()), 1, 1);
compute_encoder->dispatchThreads(grid_dims, group_dims);
} else {
size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
size_t rest = outputs[0].size() / (dim0 * dim1);
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size != 1024) {
throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
}
auto group_dims = get_block_dims(dim0, dim1, rest);
MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
compute_encoder->dispatchThreads(grid_dims, group_dims);
}
}
} // namespace mlx::core
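A small sketch of the kernel-selection rule used above (the library name string below is a placeholder): contiguous inputs take the "_contiguous" variant, collapsed shapes with fewer than 8 dims take a fixed-ndim strided variant, and anything larger falls back to the dynamic-ndim kernel:
#include <cassert>
#include <string>

std::string pick_kernel(const std::string& lib, bool contiguous, int ndim) {
  if (contiguous) {
    return lib + "_contiguous";
  }
  return ndim >= 8 ? lib + "_strided_dynamic"
                   : lib + "_strided_" + std::to_string(ndim);
}

int main() {
  assert(pick_kernel("mylib", true, 3) == "mylib_contiguous");
  assert(pick_kernel("mylib", false, 3) == "mylib_strided_3");
  assert(pick_kernel("mylib", false, 9) == "mylib_strided_dynamic");
  return 0;
}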

View File

@@ -0,0 +1,9 @@
// Copyright © 2023-24 Apple Inc.
#pragma once
namespace mlx::core::metal {
const char* get_kernel_preamble();
}

View File

@@ -2,7 +2,6 @@
#include <algorithm>
#include <cassert>
#include <iostream>
#include <numeric>
#include <sstream>
@@ -68,9 +67,9 @@ void explicit_gemm_conv_1D_gpu(
array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
copy_gpu(in_strided_view, in_strided, CopyType::General, s);
// Peform gemm
// Perform gemm
std::vector<array> copies = {in_padded, in_strided};
mlx_matmul(
return steel_matmul(
s,
d,
/*a = */ in_strided,
@@ -260,9 +259,9 @@ void explicit_gemm_conv_2D_gpu(
array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
copy_gpu(in_strided_view, in_strided, CopyType::General, s);
// Peform gemm
// Perform gemm
std::vector<array> copies = {in_padded, in_strided};
mlx_matmul(
return steel_matmul(
s,
d,
/*a = */ in_strided,
@@ -411,7 +410,7 @@ void winograd_conv_2D_gpu(
copies_w.push_back(out_wg);
{
std::vector<array> empty_copies;
mlx_matmul(
steel_matmul(
s,
d,
/*a = */ inp_wg,

View File

@@ -12,14 +12,21 @@ namespace mlx::core {
void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
if (ctype == CopyType::Vector) {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
if (in.is_donatable() && in.itemsize() == out.itemsize()) {
out.move_shared_buffer(in);
} else {
out.set_data(
allocator::malloc_or_wait(in.data_size() * out.itemsize()),
in.data_size(),
in.strides(),
in.flags());
}
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
if (out.size() == 0) {
return;
}
if (ctype == CopyType::GeneralGeneral) {
ctype = CopyType::General;
}
@@ -64,7 +71,8 @@ void copy_gpu_inplace(
auto kernel = d.get_kernel(kname.str());
auto compute_encoder = d.get_command_encoder(s.index);
compute_encoder->setComputePipelineState(kernel);
set_array_buffer(compute_encoder, in, 0);
bool donate_in = in.data_shared_ptr() == nullptr;
set_array_buffer(compute_encoder, donate_in ? out : in, 0);
set_array_buffer(compute_encoder, out, 1);
if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-24 Apple Inc.
#include <dlfcn.h>
#include <cstdlib>
@@ -17,8 +17,6 @@ namespace fs = std::filesystem;
namespace mlx::core::metal {
static Device metal_device_;
namespace {
// TODO nicer way to set this or possibly expose as an environment variable
@@ -27,7 +25,9 @@ static constexpr int MAX_BUFFERS_PER_QUEUE = 12;
static constexpr const char* default_mtllib_path = METAL_PATH;
auto load_device() {
MTL::Device* device = MTL::CreateSystemDefaultDevice();
auto devices = MTL::CopyAllDevices();
auto device = static_cast<MTL::Device*>(devices->object(0))
?: MTL::CreateSystemDefaultDevice();
if (!device) {
throw std::runtime_error("Failed to load device");
}
@@ -112,32 +112,35 @@ MTL::Library* load_library(
} // namespace
Device::Device()
: pool_(NS::AutoreleasePool::alloc()->init()),
device_(load_device()),
library_map_({{"mlx", load_library(device_)}}) {}
Device::Device() {
auto pool = new_scoped_memory_pool();
device_ = load_device();
library_map_ = {{"mlx", load_library(device_)}};
}
Device::~Device() {
auto pool = new_scoped_memory_pool();
for (auto& q : queue_map_) {
q.second->release();
}
for (auto& k : kernel_map_) {
k.second->release();
}
for (auto& l : library_map_) {
l.second->release();
}
for (auto& b : buffer_map_) {
b.second.second->release();
}
for (auto& e : encoder_map_) {
e.second->release();
}
for (auto& k : kernel_map_) {
k.second->release();
}
for (auto& l : library_map_) {
l.second->release();
}
device_->release();
pool_->release();
}
void Device::new_queue(int index) {
auto thread_pool = metal::new_scoped_memory_pool();
// Multiple threads can ask the device for queues
// We lock this as a critical section for safety
const std::lock_guard<std::mutex> lock(mtx_);
@@ -240,36 +243,127 @@ void Device::register_library(
}
}
MTL::ComputePipelineState* Device::get_kernel(
const std::string& name,
const std::string& lib_name /* = "mlx" */) {
// Look for cached kernel
if (auto it = kernel_map_.find(name); it != kernel_map_.end()) {
return it->second;
}
// Prepare new kernel
MTL::Library* Device::get_library_cache_(const std::string& lib_name) {
// Search for cached metal lib
MTL::Library* mtl_lib;
if (auto it = library_map_.find(name); it != library_map_.end()) {
if (auto it = library_map_.find(lib_name); it != library_map_.end()) {
mtl_lib = it->second;
} else { // Look for metallib alongside library
register_library(lib_name);
mtl_lib = library_map_[lib_name];
}
return mtl_lib;
}
MTL::Library* Device::get_library_(const std::string& source_string) {
auto pool = new_scoped_memory_pool();
auto ns_code =
NS::String::string(source_string.c_str(), NS::ASCIIStringEncoding);
NS::Error* error = nullptr;
auto mtl_lib = device_->newLibrary(ns_code, nullptr, &error);
// Throw error if unable to compile library
if (!mtl_lib) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load build metal library from source"
<< "\n";
if (error) {
msg << error->localizedDescription()->utf8String() << "\n";
}
throw std::runtime_error(msg.str());
}
return mtl_lib;
}
MTL::Library* Device::get_library_(const MTL::StitchedLibraryDescriptor* desc) {
auto pool = new_scoped_memory_pool();
NS::Error* error = nullptr;
auto mtl_lib = device_->newLibrary(desc, &error);
// Throw error if unable to compile library
if (!mtl_lib) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load build stitched metal library"
<< "\n";
if (error) {
msg << error->localizedDescription()->utf8String() << "\n";
}
throw std::runtime_error(msg.str());
}
return mtl_lib;
}
MTL::Function* Device::get_function_(
const std::string& name,
MTL::Library* mtl_lib) {
// Pull kernel from library
auto ns_name = NS::String::string(name.c_str(), NS::ASCIIStringEncoding);
auto mtl_function = mtl_lib->newFunction(ns_name);
return mtl_function;
}
MTL::Function* Device::get_function_(
const std::string& name,
const std::string& specialized_name,
const MTLFCList& func_consts,
MTL::Library* mtl_lib) {
if (func_consts.empty() && (specialized_name == name)) {
return get_function_(name, mtl_lib);
}
// Prepare function constants
auto mtl_func_consts = MTL::FunctionConstantValues::alloc()->init();
for (auto [value, type, index] : func_consts) {
mtl_func_consts->setConstantValue(value, type, index);
}
// Prepare function desc
auto desc = MTL::FunctionDescriptor::functionDescriptor();
desc->setName(NS::String::string(name.c_str(), NS::ASCIIStringEncoding));
desc->setSpecializedName(
NS::String::string(specialized_name.c_str(), NS::ASCIIStringEncoding));
desc->setConstantValues(mtl_func_consts);
// Pull kernel from library
NS::Error* error = nullptr;
auto mtl_function = mtl_lib->newFunction(desc, &error);
// Throw error if unable to build metal function
if (!mtl_function) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load function " << name << "\n";
if (error) {
msg << error->localizedDescription()->utf8String() << "\n";
}
throw std::runtime_error(msg.str());
}
mtl_func_consts->release();
desc->release();
return mtl_function;
}
MTL::ComputePipelineState* Device::get_kernel_(
const std::string& name,
const MTL::Function* mtl_function) {
// Compile kernel to compute pipeline
NS::Error* error = nullptr;
MTL::ComputePipelineState* kernel;
if (mtl_function) {
kernel = device_->newComputePipelineState(mtl_function, &error);
mtl_function->release();
}
// Throw error if unable to compile metal function
if (!mtl_function || !kernel) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load kernel " << name << "\n";
@@ -279,23 +373,188 @@ MTL::ComputePipelineState* Device::get_kernel(
throw std::runtime_error(msg.str());
}
// Add kernel to cache
kernel_map_.insert({name, kernel});
return kernel;
}
Device& device(mlx::core::Device) {
return metal_device_;
MTL::ComputePipelineState* Device::get_kernel_(
const std::string& name,
const MTL::Function* mtl_function,
const MTL::LinkedFunctions* linked_functions) {
// Check inputs
if (!linked_functions) {
return get_kernel_(name, mtl_function);
}
if (!mtl_function) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load kernel " << name << "\n";
throw std::runtime_error(msg.str());
}
// Prepare compute pipeline state descriptor
auto desc = MTL::ComputePipelineDescriptor::alloc()->init();
desc->setComputeFunction(mtl_function);
desc->setLinkedFunctions(linked_functions);
// Compile kernel to compute pipeline
NS::Error* error = nullptr;
auto kernel = device_->newComputePipelineState(
desc, MTL::PipelineOptionNone, nullptr, &error);
// Throw error if unable to compile metal function
if (!kernel) {
std::ostringstream msg;
msg << "[metal::Device] Unable to load kernel " << name << "\n";
if (error) {
msg << error->localizedDescription()->utf8String() << "\n";
}
throw std::runtime_error(msg.str());
}
return kernel;
}
NS::AutoreleasePool*& thread_autorelease_pool() {
static thread_local NS::AutoreleasePool* p =
NS::AutoreleasePool::alloc()->init();
return p;
MTL::Library* Device::get_library(const std::string& name) {
auto it = library_map_.find(name);
return (it != library_map_.end()) ? it->second : nullptr;
}
MTL::Library* Device::get_library(
const std::string& name,
const std::string& source,
bool cache /* = true */) {
if (cache) {
if (auto it = library_map_.find(name); it != library_map_.end()) {
return it->second;
}
}
auto mtl_lib = get_library_(source);
if (cache) {
library_map_.insert({name, mtl_lib});
}
return mtl_lib;
}
MTL::Library* Device::get_library(
const std::string& name,
const MTL::StitchedLibraryDescriptor* desc,
bool cache /* = true */) {
if (cache) {
if (auto it = library_map_.find(name); it != library_map_.end()) {
return it->second;
}
}
auto mtl_lib = get_library_(desc);
if (cache) {
library_map_.insert({name, mtl_lib});
}
return mtl_lib;
}
MTL::Function* Device::get_function(
const std::string& base_name,
MTL::Library* mtl_lib,
const std::string& specialized_name /* = "" */,
const MTLFCList& func_consts /* = {} */) {
return get_function_(base_name, specialized_name, func_consts, mtl_lib);
}
MTL::Function* Device::get_function(
const std::string& base_name,
const std::string& lib_name /* = "mlx" */,
const std::string& specialized_name /* = "" */,
const MTLFCList& func_consts /* = {} */) {
// Search for cached metal lib
MTL::Library* mtl_lib = get_library_cache_(lib_name);
return get_function(base_name, mtl_lib, specialized_name, func_consts);
}
MTL::LinkedFunctions* Device::get_linked_functions_(
const std::vector<MTL::Function*>& funcs) {
if (funcs.empty()) {
return nullptr;
}
auto lfuncs = MTL::LinkedFunctions::linkedFunctions();
std::vector<NS::Object*> objs(funcs.size());
for (int i = 0; i < funcs.size(); i++) {
objs[i] = funcs[i];
}
NS::Array* funcs_arr = NS::Array::array(objs.data(), funcs.size());
lfuncs->setPrivateFunctions(funcs_arr);
return lfuncs;
}
MTL::ComputePipelineState* Device::get_kernel(
const std::string& base_name,
MTL::Library* mtl_lib,
const std::string& hash_name /* = "" */,
const MTLFCList& func_consts /* = {} */,
const std::vector<MTL::Function*>& linked_functions /* = {} */) {
auto pool = new_scoped_memory_pool();
// Look for cached kernel
const auto& kname = hash_name.empty() ? base_name : hash_name;
if (auto it = kernel_map_.find(kname); it != kernel_map_.end()) {
return it->second;
}
// Pull kernel from library
auto mtl_function = get_function_(base_name, kname, func_consts, mtl_lib);
// Compile kernel to compute pipeline
auto mtl_linked_funcs = get_linked_functions_(linked_functions);
auto kernel = get_kernel_(kname, mtl_function, mtl_linked_funcs);
mtl_function->release();
mtl_linked_funcs->release();
// Add kernel to cache
kernel_map_.insert({kname, kernel});
return kernel;
}
MTL::ComputePipelineState* Device::get_kernel(
const std::string& base_name,
const std::string& lib_name /* = "mlx" */,
const std::string& hash_name /* = "" */,
const MTLFCList& func_consts /* = {} */,
const std::vector<MTL::Function*>& linked_functions /* = {} */) {
// Look for cached kernel
const auto& kname = hash_name.size() == 0 ? base_name : hash_name;
if (auto it = kernel_map_.find(kname); it != kernel_map_.end()) {
return it->second;
}
// Search for cached metal lib
MTL::Library* mtl_lib = get_library_cache_(lib_name);
return get_kernel(base_name, mtl_lib, kname, func_consts, linked_functions);
}
Device& device(mlx::core::Device) {
static Device metal_device;
return metal_device;
}
std::shared_ptr<void> new_scoped_memory_pool() {
auto dtor = [](void* ptr) {
static_cast<NS::AutoreleasePool*>(ptr)->release();
};
return std::shared_ptr<void>(NS::AutoreleasePool::alloc()->init(), dtor);
}
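new_scoped_memory_pool above is a type-erased RAII wrapper: a std::shared_ptr<void> whose custom deleter drains the autorelease pool when the scope ends. The same idiom, sketched with an arbitrary stand-in resource in place of NS::AutoreleasePool:
// The deleter runs when the last shared_ptr copy goes out of scope.
#include <cstdio>
#include <memory>
struct Resource {
  Resource() { std::puts("acquire"); }
  void release() {
    std::puts("release");
    delete this;
  }
};
std::shared_ptr<void> new_scoped_resource() {
  auto dtor = [](void* ptr) { static_cast<Resource*>(ptr)->release(); };
  return std::shared_ptr<void>(new Resource(), dtor);
}
int main() {
  auto scope = new_scoped_resource();  // "release" prints when main returns
  return 0;
}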
void new_stream(Stream stream) {
thread_autorelease_pool();
if (stream.device == mlx::core::Device::gpu) {
device(stream.device).new_queue(stream.index);
}

View File

@@ -1,4 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-24 Apple Inc.
#pragma once
@@ -31,6 +31,9 @@ inline std::string get_colocated_mtllib_path(const std::string& lib_name) {
return mtllib_path;
}
using MTLFCList =
std::vector<std::tuple<const void*, MTL::DataType, NS::UInteger>>;
class Device {
public:
Device();
@@ -59,15 +62,73 @@ class Device {
const std::function<std::string(const std::string&)>& lib_path_func =
get_colocated_mtllib_path);
MTL::ComputePipelineState* get_kernel(
MTL::Library* get_library(const std::string& name);
MTL::Library* get_library(
const std::string& name,
const std::string& lib_name = "mlx");
const std::string& source_string,
bool cache = true);
MTL::Library* get_library(
const std::string& name,
const MTL::StitchedLibraryDescriptor* desc,
bool cache = true);
MTL::Function* get_function(
const std::string& base_name,
MTL::Library* mtl_lib,
const std::string& specialized_name = "",
const MTLFCList& func_consts = {});
MTL::Function* get_function(
const std::string& base_name,
const std::string& lib_name = "mlx",
const std::string& specialized_name = "",
const MTLFCList& func_consts = {});
MTL::ComputePipelineState* get_kernel(
const std::string& base_name,
MTL::Library* mtl_lib,
const std::string& hash_name = "",
const MTLFCList& func_consts = {},
const std::vector<MTL::Function*>& linked_functions = {});
MTL::ComputePipelineState* get_kernel(
const std::string& base_name,
const std::string& lib_name = "mlx",
const std::string& hash_name = "",
const MTLFCList& func_consts = {},
const std::vector<MTL::Function*>& linked_functions = {});
MTL::ArgumentEncoder* argument_encoder(
const std::vector<MTL::ArgumentDescriptor*>& arg_descs) const;
private:
NS::AutoreleasePool* pool_;
MTL::Library* get_library_cache_(const std::string& name);
MTL::Library* get_library_(const std::string& source_string);
MTL::Library* get_library_(const MTL::StitchedLibraryDescriptor* desc);
MTL::Function* get_function_(const std::string& name, MTL::Library* mtl_lib);
MTL::Function* get_function_(
const std::string& name,
const std::string& specialized_name,
const MTLFCList& func_consts,
MTL::Library* mtl_lib);
MTL::LinkedFunctions* get_linked_functions_(
const std::vector<MTL::Function*>& funcs);
MTL::ComputePipelineState* get_kernel_(
const std::string& name,
const MTL::Function* mtl_function);
MTL::ComputePipelineState* get_kernel_(
const std::string& name,
const MTL::Function* mtl_function,
const MTL::LinkedFunctions* linked_functions);
MTL::Device* device_;
std::unordered_map<int32_t, MTL::CommandQueue*> queue_map_;
std::unordered_map<int32_t, std::pair<int, MTL::CommandBuffer*>> buffer_map_;
@@ -78,6 +139,5 @@ class Device {
};
Device& device(mlx::core::Device);
NS::AutoreleasePool*& thread_autorelease_pool();
} // namespace mlx::core::metal
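A hedged usage sketch of the expanded kernel API declared above; the kernel name, hash name, and function-constant value are hypothetical, and error handling is left out:
// Hypothetical caller (not taken from the MLX sources): fetch a pipeline that
// is specialized by one function constant, cached under a separate hash name
// so the specialized variant does not collide with the base kernel.
#include "mlx/backend/metal/device.h"  // Device, MTLFCList, device()
namespace mlx::core {
MTL::ComputePipelineState* example_kernel_lookup() {
  bool use_fast_path = true;  // assumed function-constant value
  metal::MTLFCList consts = {{&use_fast_path, MTL::DataTypeBool, 0}};
  auto& dev = metal::device(Device::gpu);
  return dev.get_kernel(
      /* base_name = */ "my_generic_kernel",       // hypothetical kernel name
      /* lib_name  = */ "mlx",
      /* hash_name = */ "my_generic_kernel_fast",  // cache key for the variant
      /* func_consts = */ consts,
      /* linked_functions = */ {});
}
} // namespace mlx::core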

View File

@@ -1,5 +1,4 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cassert>
#include <numeric>
@@ -33,13 +32,22 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
if (out.size() == 0) {
return;
}
auto& s = stream();
auto& d = metal::device(s.device);
int idx_ndim = nidx ? inputs[1].ndim() : 0;
size_t ndim = src.ndim();
std::ostringstream kname;
std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";
kname << "gather" << type_to_name(src) << idx_type_name << "_" << nidx;
if (idx_ndim <= 1) {
kname << "_" << idx_ndim;
}
auto compute_encoder = d.get_command_encoder(s.index);
auto kernel = d.get_kernel(kname.str());
@@ -49,15 +57,11 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
slice_size *= s;
}
size_t ndim = src.ndim();
size_t nthreads = out.size();
NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
if (thread_group_size > nthreads) {
thread_group_size = nthreads;
}
MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
// Launch 2D grid of threads: indices x slice
size_t dim0 = out.size() / slice_size;
size_t dim1 = slice_size;
auto group_dims = get_block_dims(dim0, dim1, 1);
MTL::Size grid_dims = MTL::Size(dim0, dim1, 1);
compute_encoder->setComputePipelineState(kernel);
@@ -88,7 +92,6 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
auto arg_enc = d.argument_encoder(arg_descs);
// Allocate and fill buffers for shapes and strides
int idx_ndim = nidx ? inputs[1].ndim() : 0;
auto idx_shapes_buf = allocator::malloc_or_wait(sizeof(int) * idx_ndim);
auto idx_strides_buf = allocator::malloc_or_wait(sizeof(size_t) * idx_ndim);
for (int i = 0; i < nidx; ++i) {
@@ -102,7 +105,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
static_cast<size_t*>(idx_strides_buf.raw_ptr()) + i * idx_ndim);
}
// Allocate the argument bufer
// Allocate the argument buffer
auto arg_buf = allocator::malloc_or_wait(arg_enc->encodedLength());
// Register data with the encoder
@@ -110,26 +113,30 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
for (int i = 0; i < nidx; ++i) {
set_array_buffer(compute_encoder, arg_enc, inputs[i + 1], i);
}
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), 0, nidx + 1);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), MTL::ResourceUsageRead);
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), 0, nidx + 2);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), MTL::ResourceUsageRead);
if (idx_ndim > 0) {
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), 0, nidx + 1);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()),
MTL::ResourceUsageRead);
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), 0, nidx + 2);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()),
MTL::ResourceUsageRead);
}
*static_cast<int*>(arg_enc->constantData(nidx + 3)) = idx_ndim;
// Set all the buffers
set_array_buffer(compute_encoder, src, 0);
compute_encoder->setBuffer(static_cast<MTL::Buffer*>(arg_buf.ptr()), 0, 1);
set_array_buffer(compute_encoder, out, 2);
compute_encoder->setBytes(src.shape().data(), ndim * sizeof(int), 3);
compute_encoder->setBytes(src.strides().data(), ndim * sizeof(size_t), 4);
compute_encoder->setBytes(&ndim, sizeof(size_t), 5);
compute_encoder->setBytes(slice_sizes_.data(), ndim * sizeof(int), 6);
compute_encoder->setBytes(&slice_size, sizeof(size_t), 7);
compute_encoder->setBytes(axes_.data(), nidx * sizeof(int), 8);
compute_encoder->setBytes(axes_.data(), nidx * sizeof(int), 7);
compute_encoder->dispatchThreads(grid_dims, group_dims);
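The gather launch above switches from a flat 1-D grid to a 2-D grid: x covers one thread per gathered slice and y covers the elements within a slice. The same arithmetic, with hypothetical sizes:
// Host-side sketch with hypothetical sizes (not MLX code).
#include <cstddef>
#include <iostream>
#include <vector>
int main() {
  std::vector<int> slice_sizes = {1, 4, 8};  // hypothetical per-index slice
  size_t slice_size = 1;
  for (int s : slice_sizes) slice_size *= s; // 32 elements per gathered slice
  size_t out_size = 1000 * slice_size;       // hypothetical output size
  size_t dim0 = out_size / slice_size;       // grid.x: one per gathered index
  size_t dim1 = slice_size;                  // grid.y: one per slice element
  std::cout << "grid = (" << dim0 << ", " << dim1 << ", 1)\n";
  return 0;
}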
@@ -163,6 +170,11 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
inputs[0].data_size() == 1 ? CopyType::Scalar : CopyType::General;
copy_gpu(inputs[0], out, copy_type);
// Empty update
if (inputs.back().size() == 0) {
return;
}
// Get stream
auto& s = stream();
auto& d = metal::device(s.device);
@@ -246,7 +258,7 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
static_cast<size_t*>(idx_strides_buf.raw_ptr()) + i * idx_ndim);
}
// Allocate the argument bufer
// Allocate the argument buffer
auto arg_buf = allocator::malloc_or_wait(arg_enc->encodedLength());
// Register data with the encoder
@@ -254,14 +266,18 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
for (int i = 0; i < nidx; ++i) {
set_array_buffer(compute_encoder, arg_enc, inputs[i + 1], i);
}
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), 0, nidx + 1);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), MTL::ResourceUsageRead);
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), 0, nidx + 2);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), MTL::ResourceUsageRead);
if (idx_ndim > 0) {
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()), 0, nidx + 1);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_shapes_buf.ptr()),
MTL::ResourceUsageRead);
arg_enc->setBuffer(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()), 0, nidx + 2);
compute_encoder->useResource(
static_cast<MTL::Buffer*>(idx_strides_buf.ptr()),
MTL::ResourceUsageRead);
}
*static_cast<int*>(arg_enc->constantData(nidx + 3)) = idx_ndim;
compute_encoder->setBuffer(static_cast<MTL::Buffer*>(arg_buf.ptr()), 0, 0);
@@ -272,14 +288,32 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
}
set_array_buffer(compute_encoder, upd, 1);
set_array_buffer(compute_encoder, out, 2);
compute_encoder->setBytes(upd.shape().data(), upd_ndim * sizeof(int), 3);
compute_encoder->setBytes(upd.strides().data(), upd_ndim * sizeof(size_t), 4);
if (upd_ndim == 0) {
// Need placeholders so Metal doesn't complain
int shape_ = 0;
size_t stride_ = 0;
compute_encoder->setBytes(&shape_, sizeof(int), 3);
compute_encoder->setBytes(&stride_, sizeof(size_t), 4);
} else {
compute_encoder->setBytes(upd.shape().data(), upd_ndim * sizeof(int), 3);
compute_encoder->setBytes(
upd.strides().data(), upd_ndim * sizeof(size_t), 4);
}
compute_encoder->setBytes(&upd_ndim, sizeof(size_t), 5);
compute_encoder->setBytes(&upd_size, sizeof(size_t), 6);
size_t out_ndim = out.ndim();
compute_encoder->setBytes(out.shape().data(), out_ndim * sizeof(int), 7);
compute_encoder->setBytes(out.strides().data(), out_ndim * sizeof(size_t), 8);
if (out_ndim == 0) {
// Need placeholders so Metal doesn't complain
int shape_ = 0;
size_t stride_ = 0;
compute_encoder->setBytes(&shape_, sizeof(int), 7);
compute_encoder->setBytes(&stride_, sizeof(size_t), 8);
} else {
compute_encoder->setBytes(out.shape().data(), out_ndim * sizeof(int), 7);
compute_encoder->setBytes(
out.strides().data(), out_ndim * sizeof(size_t), 8);
}
compute_encoder->setBytes(&out_ndim, sizeof(size_t), 9);
compute_encoder->setBytes(axes_.data(), axes_.size() * sizeof(int), 10);
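Both placeholder branches above exist because a 0-d array has no shape or strides to bind, yet the kernel's argument table still expects data at those indices. A small helper that factors out the pattern; it is not part of the MLX sources and assumes the metal-cpp headers the backend already uses:
// Bind real shape/strides when they exist, or a single dummy value for 0-d
// arrays so every argument slot stays populated.
#include <Metal/Metal.hpp>
#include <cstddef>
#include <vector>
inline void set_shape_strides(
    MTL::ComputeCommandEncoder* enc,
    const std::vector<int>& shape,
    const std::vector<size_t>& strides,
    int shape_idx,
    int strides_idx) {
  if (shape.empty()) {
    int shape_ = 0;
    size_t stride_ = 0;
    enc->setBytes(&shape_, sizeof(int), shape_idx);
    enc->setBytes(&stride_, sizeof(size_t), strides_idx);
  } else {
    enc->setBytes(shape.data(), shape.size() * sizeof(int), shape_idx);
    enc->setBytes(strides.data(), strides.size() * sizeof(size_t), strides_idx);
  }
}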

View File

@@ -1,5 +1,6 @@
set(
HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/atomic.h
${CMAKE_CURRENT_SOURCE_DIR}/bf16.h
${CMAKE_CURRENT_SOURCE_DIR}/bf16_math.h
${CMAKE_CURRENT_SOURCE_DIR}/complex.h
@@ -14,9 +15,9 @@ set(
"arange"
"arg_reduce"
"binary"
"binary_two"
"conv"
"copy"
"gemm"
"gemv"
"quantized"
"random"
@@ -28,26 +29,27 @@ set(
"indexing"
)
function(build_kernel KERNEL)
set(SRCFILE ${CMAKE_CURRENT_SOURCE_DIR}/${KERNEL}.metal)
set(HEADERS_PADDED ${HEADERS})
if(${KERNEL} STREQUAL "gemm")
set(HEADERS_PADDED ${HEADERS_PADDED} ${CMAKE_CURRENT_SOURCE_DIR}/gemm/gemm.h)
endif()
if(${KERNEL} STREQUAL "conv")
set(HEADERS_PADDED ${HEADERS_PADDED} ${CMAKE_CURRENT_SOURCE_DIR}/gemm/conv.h)
endif()
function(build_kernel_base TARGET SRCFILE DEPS)
add_custom_command(
COMMAND xcrun -sdk macosx metal -Wall -Wextra
-fno-fast-math
-c ${SRCFILE}
-I${PROJECT_SOURCE_DIR}
-o ${KERNEL}.air
DEPENDS ${SRCFILE} ${HEADERS_PADDED}
OUTPUT ${KERNEL}.air
COMMENT "Building ${KERNEL}.air"
-o ${TARGET}.air
DEPENDS ${SRCFILE} ${DEPS}
OUTPUT ${TARGET}.air
COMMENT "Building ${TARGET}.air"
VERBATIM
)
endfunction(build_kernel_base)
function(build_kernel KERNEL)
set(SRCFILE ${CMAKE_CURRENT_SOURCE_DIR}/${KERNEL}.metal)
set(HEADERS_PADDED ${HEADERS})
if(${KERNEL} STREQUAL "conv")
set(HEADERS_PADDED ${HEADERS_PADDED} ${CMAKE_CURRENT_SOURCE_DIR}/conv.h)
endif()
build_kernel_base(${KERNEL} ${SRCFILE} "${HEADERS_PADDED}")
endfunction(build_kernel)
foreach(KERNEL ${KERNELS})
@@ -55,6 +57,15 @@ foreach(KERNEL ${KERNELS})
set(KERNEL_AIR ${KERNEL}.air ${KERNEL_AIR})
endforeach()
file(GLOB_RECURSE STEEL_KERNELS ${CMAKE_CURRENT_SOURCE_DIR}/steel/*.metal)
file(GLOB_RECURSE STEEL_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/steel/*.h)
foreach(KERNEL ${STEEL_KERNELS})
cmake_path(GET KERNEL STEM TARGET)
build_kernel_base(${TARGET} ${KERNEL} "${STEEL_HEADERS}")
set(KERNEL_AIR ${TARGET}.air ${KERNEL_AIR})
endforeach()
add_custom_command(
OUTPUT ${MLX_METAL_PATH}/mlx.metallib
COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o ${MLX_METAL_PATH}/mlx.metallib

View File

@@ -63,18 +63,6 @@ struct ArgMax {
}
};
bool simd_shuffle_down(bool data, uint16_t delta) {
return simd_shuffle_down(static_cast<uint32_t>(data), delta);
}
uint64_t simd_shuffle_down(uint64_t data, uint16_t delta) {
return as_type<uint64_t>(simd_shuffle_down(as_type<uint2>(data), delta));
}
int64_t simd_shuffle_down(int64_t data, uint16_t delta) {
return as_type<int64_t>(simd_shuffle_down(as_type<uint2>(data), delta));
}
template <typename U>
IndexValPair<U> simd_shuffle_down(IndexValPair<U> data, uint16_t delta) {
return IndexValPair<U>(
@@ -114,7 +102,7 @@ template <typename T, typename Op, int N_READS>
// 4. Reduce among them and go to 3
// 4. Reduce in each simd_group
// 6. Write in the thread local memory
// 6. Reduce them accross thread group
// 6. Reduce them across thread group
// 7. Write the output without need for atomic
Op op;

View File

@@ -38,49 +38,59 @@ struct mlx_atomic<T, enable_if_t<is_metal_atomic<T>>> {
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC T
mlx_atomic_load_explicit(device mlx_atomic<T>* object, int offset) {
mlx_atomic_load_explicit(device mlx_atomic<T>* object, uint offset) {
return atomic_load_explicit(&(object[offset].val), memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, int offset) {
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, uint offset) {
atomic_store_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_and_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_and_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
atomic_fetch_and_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, int offset) {
mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, uint offset) {
atomic_fetch_or_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_min_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_min_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
atomic_fetch_min_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_max_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_max_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
atomic_fetch_max_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_add_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_add_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
atomic_fetch_add_explicit(&(object[offset].val), val, memory_order_relaxed);
}
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_mul_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_mul_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
T expected = mlx_atomic_load_explicit(object, offset);
while (!mlx_atomic_compare_exchange_weak_explicit(
object, &expected, val * expected, offset)) {
@@ -92,7 +102,7 @@ METAL_FUNC bool mlx_atomic_compare_exchange_weak_explicit(
device mlx_atomic<T>* object,
thread T* expected,
T val,
int offset) {
uint offset) {
return atomic_compare_exchange_weak_explicit(
&(object[offset].val),
expected,
@@ -106,7 +116,7 @@ template <>
METAL_FUNC void mlx_atomic_fetch_min_explicit<float>(
device mlx_atomic<float>* object,
float val,
int offset) {
uint offset) {
float expected = mlx_atomic_load_explicit(object, offset);
while (val < expected) {
if (mlx_atomic_compare_exchange_weak_explicit(
@@ -121,7 +131,7 @@ template <>
METAL_FUNC void mlx_atomic_fetch_max_explicit<float>(
device mlx_atomic<float>* object,
float val,
int offset) {
uint offset) {
float expected = mlx_atomic_load_explicit(object, offset);
while (val > expected) {
if (mlx_atomic_compare_exchange_weak_explicit(
@@ -148,7 +158,7 @@ union uint_or_packed {
template <typename T, typename Op>
struct mlx_atomic_update_helper {
uint operator()(uint_or_packed<T> init, T update, int elem_offset) {
uint operator()(uint_or_packed<T> init, T update, uint elem_offset) {
Op op;
init.val[elem_offset] = op(update, init.val[elem_offset]);
return init.bits;
@@ -159,9 +169,9 @@ template <typename T, typename Op>
METAL_FUNC void mlx_atomic_update_and_store(
device mlx_atomic<T>* object,
T update,
int offset) {
int pack_offset = offset / packing_size<T>;
int elem_offset = offset % packing_size<T>;
uint offset) {
uint pack_offset = offset / packing_size<T>;
uint elem_offset = offset % packing_size<T>;
mlx_atomic_update_helper<T, Op> helper;
uint_or_packed<T> expected;
@@ -242,9 +252,9 @@ struct __Min {
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC T
mlx_atomic_load_explicit(device mlx_atomic<T>* object, int offset) {
int pack_offset = offset / sizeof(T);
int elem_offset = offset % sizeof(T);
mlx_atomic_load_explicit(device mlx_atomic<T>* object, uint offset) {
uint pack_offset = offset / sizeof(T);
uint elem_offset = offset % sizeof(T);
uint_or_packed<T> packed_val;
packed_val.bits =
atomic_load_explicit(&(object[pack_offset].val), memory_order_relaxed);
@@ -253,15 +263,17 @@ mlx_atomic_load_explicit(device mlx_atomic<T>* object, int offset) {
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, int offset) {
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, uint offset) {
mlx_atomic_update_and_store<T, __None<T>>(object, val, offset);
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_and_explicit(device mlx_atomic<T>* object, T val, int offset) {
int pack_offset = offset / packing_size<T>;
int elem_offset = offset % packing_size<T>;
METAL_FUNC void mlx_atomic_fetch_and_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
uint pack_offset = offset / packing_size<T>;
uint elem_offset = offset % packing_size<T>;
uint_or_packed<T> identity;
identity.bits = __UINT32_MAX__;
identity.val[elem_offset] = val;
@@ -272,9 +284,9 @@ mlx_atomic_fetch_and_explicit(device mlx_atomic<T>* object, T val, int offset) {
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, int offset) {
int pack_offset = offset / packing_size<T>;
int elem_offset = offset % packing_size<T>;
mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, uint offset) {
uint pack_offset = offset / packing_size<T>;
uint elem_offset = offset % packing_size<T>;
uint_or_packed<T> identity;
identity.bits = 0;
identity.val[elem_offset] = val;
@@ -284,26 +296,34 @@ mlx_atomic_fetch_or_explicit(device mlx_atomic<T>* object, T val, int offset) {
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_min_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_min_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
mlx_atomic_update_and_store<T, __Min<T>>(object, val, offset);
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_max_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_max_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
mlx_atomic_update_and_store<T, __Max<T>>(object, val, offset);
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_add_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_add_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
mlx_atomic_update_and_store<T, __Add<T>>(object, val, offset);
}
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_fetch_mul_explicit(device mlx_atomic<T>* object, T val, int offset) {
METAL_FUNC void mlx_atomic_fetch_mul_explicit(
device mlx_atomic<T>* object,
T val,
uint offset) {
mlx_atomic_update_and_store<T, __Mul<T>>(object, val, offset);
}
@@ -312,11 +332,11 @@ METAL_FUNC bool mlx_atomic_compare_exchange_weak_explicit(
device mlx_atomic<T>* object,
thread uint* expected,
uint val,
int offset) {
uint offset) {
return atomic_compare_exchange_weak_explicit(
&(object[offset].val),
expected,
val,
memory_order_relaxed,
memory_order_relaxed);
}
}
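For types without native Metal atomics, the offset arithmetic above splits a per-element offset into the 32-bit word that holds the element and the lane within that word. A plain C++ sketch, assuming a packing_size of 4 (four 8-bit values per atomic uint):
// Host-side sketch only: which atomic word and which lane an offset maps to.
#include <cstdint>
#include <iostream>
int main() {
  constexpr unsigned packing_size = 4;
  for (unsigned offset = 0; offset < 10; ++offset) {
    unsigned pack_offset = offset / packing_size;  // which atomic word
    unsigned elem_offset = offset % packing_size;  // which lane in that word
    std::cout << offset << " -> word " << pack_offset
              << ", lane " << elem_offset << "\n";
  }
  return 0;
}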

View File

@@ -0,0 +1,221 @@
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <metal_integer>
#include <metal_math>
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/utils.h"
struct Add {
template <typename T>
T operator()(T x, T y) {
return x + y;
}
};
struct Divide {
template <typename T>
T operator()(T x, T y) {
return x / y;
}
};
struct Remainder {
template <typename T>
T operator()(T x, T y) {
return x % y;
}
template <>
float operator()(float x, float y) {
return fmod(x, y);
}
template <>
half operator()(half x, half y) {
return fmod(x, y);
}
template <>
bfloat16_t operator()(bfloat16_t x, bfloat16_t y) {
return fmod(x, y);
}
};
struct Equal {
template <typename T>
bool operator()(T x, T y) {
return x == y;
}
};
struct NaNEqual {
template <typename T>
bool operator()(T x, T y) {
return x == y || (metal::isnan(x) && metal::isnan(y));
}
template <>
bool operator()(complex64_t x, complex64_t y) {
return x == y ||
(metal::isnan(x.real) && metal::isnan(y.real) && metal::isnan(x.imag) &&
metal::isnan(y.imag)) ||
(x.real == y.real && metal::isnan(x.imag) && metal::isnan(y.imag)) ||
(metal::isnan(x.real) && metal::isnan(y.real) && x.imag == y.imag);
}
};
struct Greater {
template <typename T>
bool operator()(T x, T y) {
return x > y;
}
};
struct GreaterEqual {
template <typename T>
bool operator()(T x, T y) {
return x >= y;
}
};
struct Less {
template <typename T>
bool operator()(T x, T y) {
return x < y;
}
};
struct LessEqual {
template <typename T>
bool operator()(T x, T y) {
return x <= y;
}
};
struct LogAddExp {
template <typename T>
T operator()(T x, T y) {
if (metal::isnan(x) || metal::isnan(y)) {
return metal::numeric_limits<T>::quiet_NaN();
}
constexpr T inf = metal::numeric_limits<T>::infinity();
T maxval = metal::max(x, y);
T minval = metal::min(x, y);
return (minval == -inf || maxval == inf)
? maxval
: (maxval + log1p(metal::exp(minval - maxval)));
};
};
struct Maximum {
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
return metal::max(x, y);
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
if (metal::isnan(x)) {
return x;
}
return x > y ? x : y;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
if (metal::isnan(x.real) || metal::isnan(x.imag)) {
return x;
}
return x > y ? x : y;
}
};
struct Minimum {
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
return metal::min(x, y);
}
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
if (metal::isnan(x)) {
return x;
}
return x < y ? x : y;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
if (metal::isnan(x.real) || metal::isnan(x.imag)) {
return x;
}
return x < y ? x : y;
}
};
struct Multiply {
template <typename T>
T operator()(T x, T y) {
return x * y;
}
};
struct NotEqual {
template <typename T>
bool operator()(T x, T y) {
return x != y;
}
template <>
bool operator()(complex64_t x, complex64_t y) {
return x.real != y.real || x.imag != y.imag;
}
};
struct Power {
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T base, T exp) {
return metal::pow(base, exp);
}
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T base, T exp) {
T res = 1;
while (exp) {
if (exp & 1) {
res *= base;
}
exp >>= 1;
base *= base;
}
return res;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
auto x_theta = metal::atan(x.imag / x.real);
auto x_ln_r = 0.5 * metal::log(x.real * x.real + x.imag * x.imag);
auto mag = metal::exp(y.real * x_ln_r - y.imag * x_theta);
auto phase = y.imag * x_ln_r + y.real * x_theta;
return {mag * metal::cos(phase), mag * metal::sin(phase)};
}
};
struct Subtract {
template <typename T>
T operator()(T x, T y) {
return x - y;
}
};
struct LogicalAnd {
template <typename T>
T operator()(T x, T y) {
return x && y;
};
};
struct LogicalOr {
template <typename T>
T operator()(T x, T y) {
return x || y;
};
};
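The integral Power overload above is exponentiation by squaring. A host-side mirror of the same loop, with a couple of sanity checks:
// Plain C++ mirror of the integral Power operator (illustration only).
#include <cassert>
#include <cstdint>
int64_t ipow(int64_t base, int64_t exp) {
  int64_t res = 1;
  while (exp) {
    if (exp & 1) res *= base;  // fold in the current bit of the exponent
    exp >>= 1;
    base *= base;              // square for the next bit
  }
  return res;
}
int main() {
  assert(ipow(3, 5) == 243);
  assert(ipow(2, 10) == 1024);
  return 0;
}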

View File

@@ -1,135 +1,6 @@
// Copyright © 2023 Apple Inc.
// Copyright © 2023-2024 Apple Inc.
#include <metal_integer>
#include <metal_math>
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/bf16.h"
struct Add {
template <typename T> T operator()(T x, T y) { return x + y; }
};
struct Divide {
template <typename T> T operator()(T x, T y) { return x / y; }
};
struct Remainder {
template <typename T> T operator()(T x, T y) { return x % y; }
template <> float operator()(float x, float y) { return fmod(x, y); }
template <> half operator()(half x, half y) { return fmod(x, y); }
template <> bfloat16_t operator()(bfloat16_t x, bfloat16_t y) { return fmod(x, y); }
};
struct Equal {
template <typename T> bool operator()(T x, T y) { return x == y; }
};
struct NaNEqual {
template <typename T> bool operator()(T x, T y) {
return x == y || (metal::isnan(x) && metal::isnan(y));
}
template <>
bool operator()(complex64_t x, complex64_t y) {
return x == y ||
(metal::isnan(x.real) && metal::isnan(y.real)
&& metal::isnan(x.imag) && metal::isnan(y.imag)) ||
(x.real == y.real && metal::isnan(x.imag) && metal::isnan(y.imag)) ||
(metal::isnan(x.real) && metal::isnan(y.real) && x.imag == y.imag);
}
};
struct Greater {
template <typename T> bool operator()(T x, T y) { return x > y; }
};
struct GreaterEqual {
template <typename T> bool operator()(T x, T y) { return x >= y; }
};
struct Less {
template <typename T> bool operator()(T x, T y) { return x < y; }
};
struct LessEqual {
template <typename T> bool operator()(T x, T y) { return x <= y; }
};
struct LogAddExp {
template <typename T>
T operator()(T x, T y) {
constexpr T inf = metal::numeric_limits<T>::infinity();
T maxval = metal::max(x, y);
T minval = metal::min(x, y);
return (minval == -inf || maxval == inf) ? maxval :
(maxval + log1p(metal::exp(minval - maxval)));
};
};
struct Maximum {
template <typename T> T operator()(T x, T y) { return metal::max(x, y); }
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
return x >= y ? x : y;
}
};
struct Minimum {
template <typename T> T operator()(T x, T y) { return metal::min(x, y); }
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
return x <= y ? x : y;
}
};
struct Multiply {
template <typename T> T operator()(T x, T y) { return x * y; }
};
struct NotEqual {
template <typename T> bool operator()(T x, T y) { return x != y; }
template <>
bool operator()(complex64_t x, complex64_t y) {
return x.real != y.real || x.imag != y.imag;
}
};
struct Power {
template <typename T>
metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T base, T exp) {
return metal::pow(base, exp);
}
template <typename T>
metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T base, T exp) {
T res = 1;
while (exp) {
if (exp & 1) {
res *= base;
}
exp >>= 1;
base *= base;
}
return res;
}
template <>
complex64_t operator()(complex64_t x, complex64_t y) {
auto x_theta = metal::atan(x.imag / x.real);
auto x_ln_r = 0.5 * metal::log(x.real * x.real + x.imag * x.imag);
auto mag = metal::exp(y.real * x_ln_r - y.imag * x_theta);
auto phase = y.imag * x_ln_r + y.real * x_theta;
return {mag * metal::cos(phase), mag * metal::sin(phase)};
}
};
struct Subtract {
template <typename T> T operator()(T x, T y) { return x - y; }
};
#include "mlx/backend/metal/kernels/binary.h"
template <typename T, typename U, typename Op>
[[kernel]] void binary_op_s2s(
@@ -377,3 +248,6 @@ instantiate_binary_all(naneq, float16, half, bool, NaNEqual)
instantiate_binary_all(naneq, float32, float, bool, NaNEqual)
instantiate_binary_all(naneq, bfloat16, bfloat16_t, bool, NaNEqual)
instantiate_binary_all(naneq, complex64, complex64_t, bool, NaNEqual)
instantiate_binary_all(lor, bool_, bool, bool, LogicalOr)
instantiate_binary_all(land, bool_, bool, bool, LogicalAnd)

View File

@@ -0,0 +1,259 @@
// Copyright © 2023 Apple Inc.
#include <metal_integer>
#include <metal_math>
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/bf16.h"
struct FloorDivide {
template <typename T> T operator()(T x, T y) { return x / y; }
template <> float operator()(float x, float y) { return trunc(x / y); }
template <> half operator()(half x, half y) { return trunc(x / y); }
template <> bfloat16_t operator()(bfloat16_t x, bfloat16_t y) { return trunc(x / y); }
};
struct Remainder {
template <typename T> T operator()(T x, T y) { return x % y; }
template <> float operator()(float x, float y) { return fmod(x, y); }
template <> half operator()(half x, half y) { return fmod(x, y); }
template <> bfloat16_t operator()(bfloat16_t x, bfloat16_t y) { return fmod(x, y); }
};
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_s2s(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[0], b[0]);
d[index] = Op2()(a[0], b[0]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_ss(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[0], b[0]);
d[index] = Op2()(a[0], b[0]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_sv(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[0], b[index]);
d[index] = Op2()(a[0], b[index]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_vs(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[index], b[0]);
d[index] = Op2()(a[index], b[0]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_vv(
device const T* a,
device const T* b,
device U* c,
device U* d,
uint index [[thread_position_in_grid]]) {
c[index] = Op1()(a[index], b[index]);
d[index] = Op2()(a[index], b[index]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g_nd1(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t& a_stride,
constant const size_t& b_stride,
uint index [[thread_position_in_grid]]) {
auto a_idx = elem_to_loc_1(index, a_stride);
auto b_idx = elem_to_loc_1(index, b_stride);
c[index] = Op1()(a[a_idx], b[b_idx]);
d[index] = Op2()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g_nd2(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t a_strides[2],
constant const size_t b_strides[2],
uint2 index [[thread_position_in_grid]],
uint2 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_2(index, a_strides);
auto b_idx = elem_to_loc_2(index, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * index.y;
c[out_idx] = Op1()(a[a_idx], b[b_idx]);
d[out_idx] = Op2()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g_nd3(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const size_t a_strides[3],
constant const size_t b_strides[3],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto a_idx = elem_to_loc_3(index, a_strides);
auto b_idx = elem_to_loc_3(index, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op1()(a[a_idx], b[b_idx]);
d[out_idx] = Op2()(a[a_idx], b[b_idx]);
}
template <typename T, typename U, typename Op1, typename Op2, int DIM>
[[kernel]] void binary_op_g_nd(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const int shape[DIM],
constant const size_t a_strides[DIM],
constant const size_t b_strides[DIM],
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd<DIM>(index, shape, a_strides, b_strides);
size_t out_idx = index.x + (size_t)grid_dim.x * (index.y + (size_t)grid_dim.y * index.z);
c[out_idx] = Op1()(a[idx.x], b[idx.y]);
d[out_idx] = Op2()(a[idx.x], b[idx.y]);
}
template <typename T, typename U, typename Op1, typename Op2>
[[kernel]] void binary_op_g(
device const T* a,
device const T* b,
device U* c,
device U* d,
constant const int* shape,
constant const size_t* a_strides,
constant const size_t* b_strides,
constant const int& ndim,
uint3 index [[thread_position_in_grid]],
uint3 grid_dim [[threads_per_grid]]) {
auto idx = elem_to_loc_2_nd(index, shape, a_strides, b_strides, ndim);
size_t out_idx = index.x + grid_dim.x * (index.y + grid_dim.y * index.z);
c[out_idx] = Op1()(a[idx.x], b[idx.y]);
d[out_idx] = Op2()(a[idx.x], b[idx.y]);
}
#define instantiate_binary(name, itype, otype, op1, op2, bopt) \
template [[host_name(name)]] \
[[kernel]] void binary_op_##bopt<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
uint index [[thread_position_in_grid]]);
#define instantiate_binary_g_dim(name, itype, otype, op1, op2, dims) \
template [[host_name(name "_" #dims)]] \
[[kernel]] void binary_op_g_nd<itype, otype, op1, op2, dims>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const int shape[dims], \
constant const size_t a_strides[dims], \
constant const size_t b_strides[dims], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_binary_g_nd(name, itype, otype, op1, op2) \
template [[host_name(name "_1")]] \
[[kernel]] void binary_op_g_nd1<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const size_t& a_stride, \
constant const size_t& b_stride, \
uint index [[thread_position_in_grid]]); \
template [[host_name(name "_2")]] \
[[kernel]] void binary_op_g_nd2<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const size_t a_strides[2], \
constant const size_t b_strides[2], \
uint2 index [[thread_position_in_grid]], \
uint2 grid_dim [[threads_per_grid]]); \
template [[host_name(name "_3")]] \
[[kernel]] void binary_op_g_nd3<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const size_t a_strides[3], \
constant const size_t b_strides[3], \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]); \
instantiate_binary_g_dim(name, itype, otype, op1, op2, 4) \
instantiate_binary_g_dim(name, itype, otype, op1, op2, 5)
#define instantiate_binary_g(name, itype, otype, op1, op2) \
template [[host_name(name)]] \
[[kernel]] void binary_op_g<itype, otype, op1, op2>( \
device const itype* a, \
device const itype* b, \
device otype* c, \
device otype* d, \
constant const int* shape, \
constant const size_t* a_strides, \
constant const size_t* b_strides, \
constant const int& ndim, \
uint3 index [[thread_position_in_grid]], \
uint3 grid_dim [[threads_per_grid]]);
#define instantiate_binary_all(name, tname, itype, otype, op1, op2) \
instantiate_binary("ss" #name #tname, itype, otype, op1, op2, ss) \
instantiate_binary("sv" #name #tname, itype, otype, op1, op2, sv) \
instantiate_binary("vs" #name #tname, itype, otype, op1, op2, vs) \
instantiate_binary("vv" #name #tname, itype, otype, op1, op2, vv) \
instantiate_binary_g("g" #name #tname, itype, otype, op1, op2) \
instantiate_binary_g_nd("g" #name #tname, itype, otype, op1, op2)
#define instantiate_binary_float(name, op1, op2) \
instantiate_binary_all(name, float16, half, half, op1, op2) \
instantiate_binary_all(name, float32, float, float, op1, op2) \
instantiate_binary_all(name, bfloat16, bfloat16_t, bfloat16_t, op1, op2)
#define instantiate_binary_types(name, op1, op2) \
instantiate_binary_all(name, bool_, bool, bool, op1, op2) \
instantiate_binary_all(name, uint8, uint8_t, uint8_t, op1, op2) \
instantiate_binary_all(name, uint16, uint16_t, uint16_t, op1, op2) \
instantiate_binary_all(name, uint32, uint32_t, uint32_t, op1, op2) \
instantiate_binary_all(name, uint64, uint64_t, uint64_t, op1, op2) \
instantiate_binary_all(name, int8, int8_t, int8_t, op1, op2) \
instantiate_binary_all(name, int16, int16_t, int16_t, op1, op2) \
instantiate_binary_all(name, int32, int32_t, int32_t, op1, op2) \
instantiate_binary_all(name, int64, int64_t, int64_t, op1, op2) \
instantiate_binary_all(name, complex64, complex64_t, complex64_t, op1, op2) \
instantiate_binary_float(name, op1, op2)
instantiate_binary_types(divmod, FloorDivide, Remainder)
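Each divmod thread instantiated above writes two outputs from the same pair of inputs: the truncated quotient (FloorDivide) and the remainder (fmod for floating types). A scalar sketch of the float path:
// Scalar sketch of what one divmod thread computes for floats.
#include <cmath>
#include <iostream>
int main() {
  float x = 7.5f, y = 2.0f;
  float c = std::trunc(x / y);  // FloorDivide output
  float d = std::fmod(x, y);    // Remainder output
  std::cout << "c = " << c << ", d = " << d << "\n";  // c = 3, d = 1.5
  return 0;
}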

View File

@@ -0,0 +1,4 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/backend/metal/kernels/binary.h"
#include "mlx/backend/metal/kernels/unary.h"

View File

@@ -45,7 +45,7 @@ struct complex64_t {
typename = typename enable_if<can_convert_to_complex64<T>>::type>
constexpr complex64_t(T x) constant : real(x), imag(0) {}
// Converstions from complex64_t
// Conversions from complex64_t
template <
typename T,
typename = typename enable_if<can_convert_from_complex64<T>>::type>

View File

@@ -105,7 +105,7 @@ struct Conv2DInputBlockLoader {
}
}
// Zero pad otherwize
// Zero pad otherwise
else {
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; ++j) {
@@ -334,7 +334,7 @@ struct Conv2DBlockMMA {
}
simdgroup_barrier(mem_flags::mem_none);
// Multiply and accumulate into resulr simdgroup matrices
// Multiply and accumulate into result simdgroup matrices
#pragma clang loop unroll(full)
for (short i = 0; i < TM; i++) {
#pragma clang loop unroll(full)

View File

@@ -5,7 +5,7 @@
#include "mlx/backend/metal/kernels/conv_params.h"
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/gemm/conv.h"
#include "mlx/backend/metal/kernels/conv.h"
using namespace metal;

View File

@@ -1,538 +0,0 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include <metal_simdgroup>
#include <metal_simdgroup_matrix>
#include <metal_stdlib>
#define MLX_MTL_CONST static constant constexpr const
using namespace metal;
///////////////////////////////////////////////////////////////////////////////
// Loading helper
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
int BROWS,
int BCOLS,
int BK,
int vec_size,
int tgp_size,
bool transpose,
bool ldK,
int tgp_padding = 0>
struct BlockLoader {
// Destination dimensions
MLX_MTL_CONST int dst_fd = transpose ? BCOLS : BROWS;
MLX_MTL_CONST int dst_ld = (transpose ? BROWS : BCOLS) + tgp_padding;
MLX_MTL_CONST int n_vecs = (transpose ? BROWS : BCOLS) / vec_size;
// Stride along block row within the block
MLX_MTL_CONST int bstride = tgp_size / n_vecs;
// Leading dimension for src
const int src_ld;
// Stride along reduction axis between blocks
const int tstride;
// Thread location indices
const short thread_idx;
const short bi;
const short bj;
// threadgroup and device memory
threadgroup T* dst;
const device T* src;
/* Constructor */
METAL_FUNC BlockLoader(
const device T* src_,
const int src_ld_,
threadgroup T* dst_,
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]])
: src_ld(src_ld_),
tstride(
BK * ((int)(transpose ^ !ldK) * src_ld + (int)(transpose ^ ldK))),
thread_idx(simd_group_id * 32 + simd_lane_id),
bi(thread_idx / n_vecs),
bj(vec_size * (thread_idx % n_vecs)),
dst(dst_ + bi * dst_ld + bj),
src(src_ + bi * src_ld + bj) {}
/* Load from device memory into threadgroup memory - without bound checking */
METAL_FUNC void load_unsafe() const {
#pragma clang loop unroll(full)
for (short i = 0; i < dst_fd; i += bstride) {
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = src[i * src_ld + j];
}
}
}
/* Load from device memory into threadgroup memory - with bound checking */
METAL_FUNC void load_safe(short2 src_tile_dim) const {
src_tile_dim = transpose ? src_tile_dim.yx : src_tile_dim.xy;
// Iterate over rows of block
#pragma clang loop unroll(full)
for (short i = 0; i < dst_fd; i += bstride) {
// Row is in bounds, we check against column
if ((bi + i) < src_tile_dim.y) {
// Use fast thread memory for bound checks
short tmp_idx[vec_size];
T tmp_val[vec_size];
// Make sure tmp_idx only contains valid indices
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
tmp_idx[j] = bj + j < src_tile_dim.x ? j : 0;
}
// Read all valid indices into tmp_val
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = src[i * src_ld + tmp_idx[j]];
}
// Zero out unneeded values
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
tmp_val[j] = bj + j < src_tile_dim.x ? tmp_val[j] : T(0);
}
// Copy values to threadgroup memory
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = tmp_val[j];
}
}
// Row is out of bounds, we just fill tgp memory with zeros
else {
#pragma clang loop unroll(full)
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = T(0);
}
}
}
}
/* Iteration helper */
METAL_FUNC void next() {
src += tstride;
}
};
///////////////////////////////////////////////////////////////////////////////
// Transforms
///////////////////////////////////////////////////////////////////////////////
template <typename OutT, typename InT>
struct TransformNone {
static METAL_FUNC OutT apply(InT x) {
return static_cast<OutT>(x);
}
};
template <typename T>
struct AccumHelper {
typedef float accum_type;
};
///////////////////////////////////////////////////////////////////////////////
// MMA helper
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_a,
bool transpose_b,
int tgp_padding_a = 0,
int tgp_padding_b = 0,
typename AccumType = typename AccumHelper<T>::accum_type,
typename Epilogue = TransformNone<T, AccumType>>
struct BlockMMA {
// Warp tile size along M
MLX_MTL_CONST int TM = BM / (WM * 8);
// Warp tile size along N
MLX_MTL_CONST int TN = BN / (WN * 8);
// Warp tile simdgroup matrix strides along M
MLX_MTL_CONST int TM_stride = 8 * WM;
// Warp tile simdgroup matrix strides along N
MLX_MTL_CONST int TN_stride = 8 * WN;
// Leading dimensions of threadgroup A, B blocks
MLX_MTL_CONST int lda_tgp = (transpose_a ? BM : BK) + tgp_padding_a;
MLX_MTL_CONST int ldb_tgp = (transpose_b ? BK : BN) + tgp_padding_b;
// Strides of A, B along reduction axis
MLX_MTL_CONST short simd_stride_a =
transpose_a ? TM_stride : TM_stride * lda_tgp;
MLX_MTL_CONST short simd_stride_b =
transpose_b ? TN_stride * ldb_tgp : TN_stride;
// Jump between elements
MLX_MTL_CONST short jump_a = transpose_a ? lda_tgp : 1;
MLX_MTL_CONST short jump_b = transpose_b ? ldb_tgp : 1;
// Offsets within threadgroup
const int tm;
const int tn;
// Simdgroup matrices
simdgroup_matrix<AccumType, 8, 8> Asimd[TM];
simdgroup_matrix<AccumType, 8, 8> Bsimd[TN];
simdgroup_matrix<AccumType, 8, 8> results[TM * TN] = {
simdgroup_matrix<AccumType, 8, 8>(0)};
short sm;
short sn;
/* Constructor */
METAL_FUNC BlockMMA(
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint simd_lane_id [[thread_index_in_simdgroup]])
: tm(8 * (simd_group_id / WN)), tn(8 * (simd_group_id % WN)) {
short qid = simd_lane_id / 4;
sm = (qid & 4) + (simd_lane_id / 2) % 4;
sn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;
}
/* (BM, BK) X (BK, BN) multiply accumulate function */
METAL_FUNC void mma(const threadgroup T* As, const threadgroup T* Bs) {
// Iterate over BK in blocks of 8
#pragma clang loop unroll(full)
for (short kk = 0; kk < BK; kk += 8) {
short2 offset_a =
transpose_a ? short2(tm + sm, kk + sn) : short2(kk + sn, tm + sm);
short2 offset_b =
transpose_b ? short2(kk + sm, tn + sn) : short2(tn + sn, kk + sm);
const threadgroup T* As__ = As + offset_a.y * lda_tgp + offset_a.x;
const threadgroup T* Bs__ = Bs + offset_b.y * ldb_tgp + offset_b.x;
simdgroup_barrier(mem_flags::mem_none);
// Load elements from threadgroup A as simdgroup matrices
#pragma clang loop unroll(full)
for (short i = 0; i < TM; i++) {
Asimd[i].thread_elements()[0] = static_cast<AccumType>(As__[0]);
Asimd[i].thread_elements()[1] = static_cast<AccumType>(As__[jump_a]);
As__ += simd_stride_a;
}
simdgroup_barrier(mem_flags::mem_none);
// Load elements from threadgroup B as simdgroup matrices
#pragma clang loop unroll(full)
for (short j = 0; j < TN; j++) {
Bsimd[j].thread_elements()[0] = static_cast<AccumType>(Bs__[0]);
Bsimd[j].thread_elements()[1] = static_cast<AccumType>(Bs__[jump_b]);
Bs__ += simd_stride_b;
}
simdgroup_barrier(mem_flags::mem_none);
// Multiply and accumulate into result simdgroup matrices
#pragma clang loop unroll(full)
for (short i = 0; i < TM; i++) {
#pragma clang loop unroll(full)
for (short j = 0; j < TN; j++) {
simdgroup_multiply_accumulate(
results[i * TN + j], Asimd[i], Bsimd[j], results[i * TN + j]);
}
}
}
}
/* Store results from simdgroup_matrix results into device memory */
METAL_FUNC void store_result(device T* C, const int ldc) const {
#pragma clang loop unroll(full)
for (int i = 0; i < TM; i++) {
#pragma clang loop unroll(full)
for (int j = 0; j < TN; j++) {
C[(i * TM_stride + sm + tm) * ldc + j * TN_stride + tn + sn] =
Epilogue::apply(results[i * TN + j].thread_elements()[0]);
C[(i * TM_stride + sm + tm) * ldc + j * TN_stride + tn + sn + 1] =
Epilogue::apply(results[i * TN + j].thread_elements()[1]);
}
}
}
METAL_FUNC void
store_result_safe(device T* C, const int ldc, short2 dst_tile_dims) const {
#pragma clang loop unroll(full)
for (int i = 0; i < TM; i++) {
if (tm + i * TM_stride + sm < dst_tile_dims.y) {
#pragma clang loop unroll(full)
for (int j = 0; j < TN; j++) {
if (tn + j * TN_stride + sn < dst_tile_dims.x) {
C[(tm + i * TM_stride + sm) * ldc + tn + j * TN_stride + sn] =
Epilogue::apply(results[i * TN + j].thread_elements()[0]);
}
if (tn + j * TN_stride + sn + 1 < dst_tile_dims.x) {
C[(tm + i * TM_stride + sm) * ldc + tn + j * TN_stride + sn + 1] =
Epilogue::apply(results[i * TN + j].thread_elements()[1]);
}
}
}
}
}
};
///////////////////////////////////////////////////////////////////////////////
// GEMM kernels
///////////////////////////////////////////////////////////////////////////////
template <
typename T,
int BM,
int BN,
int BK,
int WM,
int WN,
bool transpose_a,
bool transpose_b,
bool MN_aligned,
bool K_aligned,
typename AccumType = typename AccumHelper<T>::accum_type,
typename Epilogue = TransformNone<T, AccumType>>
struct GEMMKernel {
MLX_MTL_CONST short tgp_padding_a = 16 / sizeof(T);
MLX_MTL_CONST short tgp_padding_b = 16 / sizeof(T);
MLX_MTL_CONST short tgp_mem_size_a =
transpose_a ? BK * (BM + tgp_padding_a) : BM * (BK + tgp_padding_a);
MLX_MTL_CONST short tgp_mem_size_b =
transpose_b ? BN * (BK + tgp_padding_b) : BK * (BN + tgp_padding_b);
MLX_MTL_CONST short tgp_mem_size = tgp_mem_size_a + tgp_mem_size_b;
MLX_MTL_CONST short tgp_size = WM * WN * 32;
MLX_MTL_CONST short vec_size = (BM == 64 && BN == 64) ? 8 : 4;
using loader_a_t = BlockLoader<
T,
BM,
BK,
BK,
vec_size,
tgp_size,
transpose_a,
true,
tgp_padding_a>;
using loader_b_t = BlockLoader<
T,
BK,
BN,
BK,
vec_size,
tgp_size,
transpose_b,
false,
tgp_padding_b>;
using mma_t = BlockMMA<
T,
BM,
BN,
BK,
WM,
WN,
transpose_a,
transpose_b,
tgp_padding_a,
tgp_padding_b,
AccumType,
Epilogue>;
/* Main kernel function */
static METAL_FUNC void run(
const device T* A [[buffer(0)]],
const device T* B [[buffer(1)]],
device T* C [[buffer(2)]],
const constant int& M [[buffer(3)]],
const constant int& N [[buffer(4)]],
const constant int& K [[buffer(5)]],
const constant int& batch_stride_a [[buffer(6)]],
const constant int& batch_stride_b [[buffer(7)]],
const constant int& batch_stride_c [[buffer(8)]],
threadgroup T* tgp_memory [[threadgroup(0)]],
uint simd_lane_id [[thread_index_in_simdgroup]],
uint simd_group_id [[simdgroup_index_in_threadgroup]],
uint3 tid [[threadgroup_position_in_grid]],
uint3 lid [[thread_position_in_threadgroup]]) {
// Pacifying compiler
(void)lid;
// Adjust for batch
A += batch_stride_a * tid.z;
B += batch_stride_b * tid.z;
C += batch_stride_c * tid.z;
// Adjust for transpose
const int lda_dev = transpose_a ? M : K;
const int ldb_dev = transpose_b ? K : N;
// Find block in A, B, C
const int c_row = tid.y * BM;
const int c_col = tid.x * BN;
A += transpose_a ? c_row : c_row * K;
B += transpose_b ? c_col * K : c_col;
C += c_row * N + c_col;
// Prepare threadgroup memory for loading
threadgroup T* As = tgp_memory;
threadgroup T* Bs = tgp_memory + tgp_mem_size_a;
// Prepare threadgroup loading operations
loader_a_t loader_a(A, lda_dev, As, simd_group_id, simd_lane_id);
loader_b_t loader_b(B, ldb_dev, Bs, simd_group_id, simd_lane_id);
// Prepare threadgroup mma operation
mma_t mma_op(simd_group_id, simd_lane_id);
///////////////////////////////////////////////////////////////////////////////
// MNK aligned loop
if (MN_aligned && K_aligned) {
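// Note (explanatory assumption about the aligned path): MN_aligned &&
// K_aligned means every BM x BK and BK x BN tile lies fully in bounds, so the
// bounds-check-free load_unsafe() loads are safe here.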
for (int k = 0; k < K; k += BK) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
threadgroup_barrier(mem_flags::mem_none);
// Store results to device memory
mma_op.store_result(C, N);
return;
}
///////////////////////////////////////////////////////////////////////////////
// MN aligned, K unaligned loop
else if (MN_aligned && !K_aligned) {
// Main loop
int k = 0;
for (; k + BK <= K; k += BK) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
// Loop tail
threadgroup_barrier(mem_flags::mem_threadgroup);
loader_a.load_safe(short2(K - k, BM));
loader_b.load_safe(short2(BN, K - k));
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_op.mma(As, Bs);
// Store results to device memory
mma_op.store_result(C, N);
return;
}
///////////////////////////////////////////////////////////////////////////////
// MNK unaligned loop
else { // Loop over K - unaligned case
short2 src_tile_dims(min(BN, N - c_col), min(BM, M - c_row));
if (src_tile_dims.y == BM && src_tile_dims.x == BN) {
int k = 0;
for (; k + BK <= K; k += BK) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
threadgroup_barrier(mem_flags::mem_none);
if (k < K) {
loader_a.load_safe(short2(K - k, BM));
loader_b.load_safe(short2(BN, K - k));
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_op.mma(As, Bs);
}
mma_op.store_result(C, N);
return;
} else {
int k = 0;
for (; k + BK <= K; k += BK) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_safe(short2(BK, src_tile_dims.y));
loader_b.load_safe(short2(src_tile_dims.x, BK));
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
threadgroup_barrier(mem_flags::mem_none);
if (k < K) {
loader_a.load_safe(short2(K - k, src_tile_dims.y));
loader_b.load_safe(short2(src_tile_dims.x, K - k));
threadgroup_barrier(mem_flags::mem_threadgroup);
mma_op.mma(As, Bs);
}
threadgroup_barrier(mem_flags::mem_none);
mma_op.store_result_safe(C, N, src_tile_dims);
return;
}
}
}
};
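// A minimal entry-point sketch (an illustrative assumption; the actual kernel
// instantiations and host-side dispatch live elsewhere in this diff). It shows
// how GEMMKernel ties the pieces together: one threadgroup scratch buffer
// sized by tgp_mem_size, and a single call into run().
template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    bool MN_aligned,
    bool K_aligned>
[[kernel]] void gemm_example(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device T* C [[buffer(2)]],
    const constant int& M [[buffer(3)]],
    const constant int& N [[buffer(4)]],
    const constant int& K [[buffer(5)]],
    const constant int& batch_stride_a [[buffer(6)]],
    const constant int& batch_stride_b [[buffer(7)]],
    const constant int& batch_stride_c [[buffer(8)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  using gemm_kernel = GEMMKernel<
      T, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned>;
  // Threadgroup scratch for the A and B tiles (padding included).
  threadgroup T tgp_memory[gemm_kernel::tgp_mem_size];
  gemm_kernel::run(
      A, B, C, M, N, K,
      batch_stride_a, batch_stride_b, batch_stride_c,
      tgp_memory, simd_lane_id, simd_group_id, tid, lid);
}
// Dispatch note: the grid is expected to be
// ((N + BN - 1) / BN, (M + BM - 1) / BM, batch) threadgroups of
// WM * WN * 32 threads each, matching how run() derives c_col/c_row from
// tid.x/tid.y and the batch offset from tid.z.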
