Compare commits

...

885 Commits

Author SHA1 Message Date
Angelos Katharopoulos
1ce0c0fcb0 Bump version (#1761) 2025-01-09 13:48:20 -08:00
Awni Hannun
657f466402 use sdpa and exportable functions in transformer multi head attention (#1760) 2025-01-09 13:11:55 -08:00
Alex Barron
c7b0300af5 Fix batched qmv bug (#1758) 2025-01-09 11:45:57 -08:00
Awni Hannun
da8c885784 Simplify removes no-ops from the tape (#1759)
* simplify removes no-ops from the tape

* comment
2025-01-09 11:23:19 -08:00
Awni Hannun
1ccaf80575 Dynamic broadcasting for shapeless compile/export (#1722)
* working towards dynamic broadcast

* shapeless broadcast

* fix build + nits

* use broadcast arrays in quantize matmul

* some cleanup / consistency

* mend

* some comments

* add vjp, jvp for broadcast axes
2025-01-09 11:04:24 -08:00
Cheng
ec36bfa317 Include command stdout in error message (#1756)
* Include command stdout in error message

* On Windows pclose returns the exit code
2025-01-08 07:17:03 -08:00
Cheng
b8f76f717a Print exceptions in eval_cpu/eval_gpu and abort (#1754) 2025-01-08 06:31:09 -08:00
Awni Hannun
d1766f2c70 Add boolean mask support in vector SDPA (#1757) 2025-01-07 20:24:53 -08:00
Awni Hannun
516ded618b Dynamic slicing (#1741)
* dynamic slice and slice update

* python bindings + tests + fix set item

* fix compile issue

* comment

* fix jit
2025-01-07 14:02:16 -08:00
Jesper Stemann Andersen
c9c81d0584 Added additional missing unordered_map include that fixes build on FreeBSD (#1755) 2025-01-07 08:27:55 -08:00
Angelos Katharopoulos
545f84d905 Refactor distributed backend (#1752) 2025-01-06 17:33:15 -08:00
Awni Hannun
d5ec172c95 Allow boolean mask in sdpa (#1753)
* allow boolean mask in sdpa

* more permissive donation in ternary
2025-01-06 16:57:07 -08:00
Angelos Katharopoulos
25b3a3e541 Optionally specify names for arrays when exporting (#1749) 2025-01-06 13:07:46 -08:00
Awni Hannun
058d6ce683 mpi send use input as output (#1750)
* mpi send use input as output

* move earlier
2025-01-06 06:08:43 -08:00
Angelos Katharopoulos
eab93985b8 Update custom function docs (#1748) 2025-01-03 16:35:25 -08:00
Awni Hannun
b51d70a83c export docs (#1747) 2025-01-03 15:04:17 -08:00
Awni Hannun
259025100e Fix nd ternary on GPU (#1746) 2025-01-03 11:52:17 -08:00
Awni Hannun
c9d30aa6ac MLX in C++ example (#1736)
* MLX in C++ example

* nits

* fix docs
2025-01-02 19:09:04 -08:00
Angelos Katharopoulos
8544b42007 Add namespace (#1745) 2025-01-02 16:49:23 -08:00
Awni Hannun
6fa0501387 Fix concatenate/slice_update vjp + reduce binary size (#1735)
* fix concatenate vjp + reduce binary size

* also cast in slice update
2025-01-02 16:36:33 -08:00
Awni Hannun
ae69cb15e9 shapeless compile in docs and partially shapeless reshape (#1742) 2025-01-02 16:24:42 -08:00
Awni Hannun
a64a8dfe45 fix extension (#1740) 2025-01-02 16:16:16 -08:00
Venkata Naga Aditya Datta Chivukula
491fa95b1f Added Kronecker Product (#1728) 2025-01-02 16:00:34 -08:00
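A minimal sketch of the Kronecker product op added in #1728, assuming it is exposed in Python as mx.kron (the exact binding name is not stated in this log):

    import mlx.core as mx

    a = mx.array([[1, 2], [3, 4]])
    b = mx.eye(2)
    # Kronecker product of two (2, 2) matrices -> (4, 4)
    print(mx.kron(a, b))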
Danilo Peixoto
92ec632ad5 Fix Distributed Communication documentation (#1731)
* Add missing `size()` method call for group
2025-01-02 14:08:38 -08:00
Cheng
8ecdfb718b Fix export.cpp compilation with MSVC (#1737) 2024-12-29 06:56:30 -08:00
Awni Hannun
4ba0c24a8f Export / import functions to / from a file (#1642)
* export and import functions

* refactor + works for few primitives

* nit

* allow primitives with state

* nit

* nit

* simplify serialize / deserialize

* fix for constants

* python bindings

* maybe fix serialize failure case

* add example

* more primitives, training kind of works

* same result for python and c++

* some fixes

* fix export

* template it up

* some simplification

* rebase

* allow kwargs and multiple functions

* exporter

* more primitives for exporting

* deal with endianness

* handle invalid stream

* add docstring
2024-12-24 11:19:13 -08:00
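A rough sketch of the export/import workflow introduced in #1642. The binding names mx.export_function / mx.import_function and the convention that an imported function returns its outputs as a sequence are assumptions, not confirmed by this log:

    import mlx.core as mx

    def fn(x, y):
        return mx.exp(x) + y

    x, y = mx.array(1.0), mx.array(2.0)
    # Trace fn on example inputs and serialize it to a file (assumed API)
    mx.export_function("fn.mlxfn", fn, x, y)
    # Load it back; the imported callable is assumed to return a list of outputs
    imported = mx.import_function("fn.mlxfn")
    (out,) = imported(x, y)
    print(out)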
Cheng
935c8c4bb1 Make mx.compile work on Windows (#1697)
* Invoke MSVC on Windows in mx.compile

* Export kernel symbol on MSVC

* Remove unused template

* Parse env pairs in a robust way

* No need of cassert

* Remove unnecessary helpers

* Fix right trim

* Move command building to a separate file

* Missing header

* Do not pollute cwd with cl.exe

* Simplify str concat

* Pass output dir

* Fix styling
2024-12-24 07:02:33 -08:00
Valentin Roussellet
88f993da38 Explicit parentheses around some logical operators (#1732)
* fix some warnings

* format
2024-12-24 07:02:20 -08:00
Awni Hannun
ebfe64b92d shapeless slice update and broadcast when possible (#1727) 2024-12-23 11:25:15 -08:00
Awni Hannun
0308e9af71 Allow offset to be an mx.array for mx.fast.rope (#1724)
* allow offset for rope

* comment
2024-12-19 15:51:44 -08:00
Awni Hannun
c3628eea49 Add mx.finfo and use it when making causal mask (#1726)
* finfo

* fixes

* docs
2024-12-19 14:52:41 -08:00
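A small sketch of how mx.finfo from #1726 can be used, e.g. to build an additive causal mask; the attribute names are assumed to mirror NumPy's finfo:

    import mlx.core as mx

    # Extreme finite values representable in a floating point dtype
    neg_inf = mx.finfo(mx.float16).min
    print(mx.finfo(mx.float32).max)

    # Additive causal mask: 0 on and below the diagonal, very negative above it
    mask = mx.triu(mx.full((4, 4), neg_inf), k=1)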
Awni Hannun
e03f0372b1 More shape type (#1705)
* more shape type

* fix
2024-12-19 08:08:20 -08:00
Alex Barron
f17536af9c More lenient mask type check in SDPA (#1723)
* check mask type

* require promotion
2024-12-18 19:41:38 -08:00
Cheng
ed4ec81bca Link python extension with mlx statically on Windows (#1716)
* Link python extension with mlx statically on Windows

* More readable code
2024-12-18 19:26:04 -08:00
Awni Hannun
7480059306 track resource limit and throw if exceeded (#1718) 2024-12-18 18:45:58 -08:00
Awni Hannun
8bae22b0fa fix deletion of non-evaled arrays with siblings (#1714) 2024-12-18 18:45:36 -08:00
Alex Barron
49c34c4161 check mask type (#1721) 2024-12-18 14:25:18 -08:00
Awni Hannun
5548fcc96d fix synch race (#1719) 2024-12-18 12:25:16 -08:00
Cheng
070bd433ab Shorter kernel name for Windows (#1701)
* Shorter kernel name for Windows

* Only hash the clipped part
2024-12-17 18:51:38 -08:00
Cheng
c8fb54951a Define NOMINMAX before windows.h (#1715) 2024-12-17 18:51:24 -08:00
Awni Hannun
f110357aaa Bump nanobind to 2.4 + fix (#1710)
* bump nanobind to 2.4 + fix

* fix
2024-12-17 10:57:54 -08:00
Tomohiro Oga
a6b426422e add cubic to type hinting for upsample (#1709) 2024-12-17 07:30:23 -08:00
Awni Hannun
d03c01dfbc fix unflatten vjp (#1708) 2024-12-16 18:37:57 -08:00
Jesper Stemann Andersen
a82996e9fb io/load: Enabled pread implementation for mingw32 (#1706) 2024-12-16 07:20:45 -08:00
Cheng
af5a614aad Eval before cleanup so model file is unlocked (#1702) 2024-12-14 21:41:49 -08:00
Cheng
f9640e049d Install mlx.dll into the same dir with python bindings on Windows (#1690)
* Install mlx.dll into the same dir with python bindings on Windows

* Set BUILD_SHARED_LIBS for dlfcn-win32

* Update cmake requirements to 3.25

* Fix cmake style
2024-12-13 19:50:39 -08:00
Cheng
4768c61b57 Make sure gguf_ctx is closed when error happens (#1699) 2024-12-13 19:50:19 -08:00
Cheng
dfccd17ab9 Use psutil to get memory info on Windows (#1700) 2024-12-13 19:50:13 -08:00
Cheng
635117c5d4 Read/write files in binary mode (#1698) 2024-12-13 17:37:05 -08:00
Awni Hannun
50f3535693 Use expand_dims / unflatten / etc in more places (#1696)
* use expand_dims / unflatten in a couple more places

* few more

* few more

* fix
2024-12-12 17:00:44 -08:00
Awni Hannun
9111999af3 Fix small sort with metal validation (#1695) 2024-12-12 09:21:45 -08:00
Awni Hannun
6bd28d246e Allow no copy negative strides in as_strided and slice (#1688)
* allow no copy negative strides in as_strided and slice

* fix jit

* fix jit
2024-12-12 08:59:45 -08:00
Cheng
4d595a2a39 Make compiled preamble work in MSVC (#1675)
* Make compiled preamble work in MSVC

* Remove logging

* Only use powershell for MSVC
2024-12-12 08:55:49 -08:00
Awni Hannun
3a21f61772 Fix build (#1693) 2024-12-11 23:56:25 -08:00
Awni Hannun
4e1e9520e1 Flatten and unflatten (#1692)
* flatten and unflatten

* fix grad

* fix shape infer

* use squeeze + unsqueeze in get_item
2024-12-11 21:51:37 -08:00
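A quick sketch of the flatten/unflatten pair from #1692, assuming the signatures mx.flatten(x, start_axis, end_axis) and mx.unflatten(x, axis, shape):

    import mlx.core as mx

    x = mx.zeros((2, 3, 4))
    y = mx.flatten(x, 1, 2)           # collapse axes 1..2 -> shape (2, 12)
    z = mx.unflatten(y, 1, (3, 4))    # split axis 1 back -> shape (2, 3, 4)
    print(y.shape, z.shape)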
Cheng
0bf19037ca Remove "using namespace mlx::core" in python/src (#1689) 2024-12-11 15:45:39 -08:00
Awni Hannun
f3dfa36a3a Fix x86 tests (#1691)
* fix x86 tests

* comment
2024-12-11 07:47:18 -08:00
Cheng
4f9b60dd53 Remove "using namespace mlx::core" in benchmarks/examples (#1685)
* Remove "using namespace mlx::core" in benchmarks/examples

* Fix building example extension

* A missing one in comment

* Fix building on M chips
2024-12-11 07:08:29 -08:00
Awni Hannun
f76a49e555 ExpandDims primitive (#1687)
* add squeeze primitive

* simplify squeeze, use in gather

* fix

* fix

* fix

* fix

* fix no cpu

* use squeeze in matmul and friends

* expand dims primitive

* comment
2024-12-10 16:39:07 -08:00
Cheng
310ad8d9db Build OpenBLAS from source code for MSVC (#1674)
* Download OpenBLAS binaries when building with MSVC

* Download dlfcn-win32

* Link with dlfcn-win32 correctly

* Build OpenBLAS from source code

* Link with openblas statically

* Link with BLAS privately
2024-12-10 16:14:44 -08:00
Cheng
56db268f47 Provide a pread implementation for MSVC (#1666) 2024-12-10 15:55:53 -08:00
Cheng
92ab6bdeb8 Fix shared library not exporting symbols on Windows (#1684)
* Fix shared library not exporting symbols on Windows

* Function name style
2024-12-10 13:59:14 -08:00
Cheng
0070e360a1 Disable MSVC warnings (#1680) 2024-12-09 19:41:14 -08:00
Amethyst Shen
9df8fed046 Metal-cpp version bump (#1668)
* Metal-cpp version bump

Apple has released the stable version of Metal-cpp for macOS 15 and iOS 18. CMakeLists.txt is updated to build with it instead of the beta one.

* Fix style with cmake-format
2024-12-09 19:40:35 -08:00
Cheng
a59fae040f Fix library output directory for MSVC (#1681) 2024-12-09 19:07:50 -08:00
Awni Hannun
29a620cab2 No reshapes in quantized embedding (#1682)
* no reshapes in quantized embedding

* fix inadvertant cast

* add tol
2024-12-09 18:57:38 -08:00
Cheng
87d7a2520e Use Py_ssize_t in python bindings (#1678)
* Use Py_ssize_t in python bindings

* Args passed to std::max must be same type
2024-12-09 12:59:19 -08:00
Awni Hannun
40c62c1321 Use int64 stride everywhere (#1671)
* use int64 stride everywhere

* fix ext

* fix ext

* more shape + cleanup

* one more

* few more
2024-12-09 11:09:02 -08:00
Awni Hannun
35b412c099 Fix compile hasher for string constants. (#1677)
* fix hash

* add test

* nit
2024-12-09 09:26:18 -08:00
Cheng
d0f471cff7 Using math defines requires switch in MSVC (#1665)
* Using math defines requires switch in MSVC

* Fix more math macros

* Fix type

* Remove _MSC_VER guard for math defines
2024-12-08 08:16:28 -08:00
Cheng
6f316b8bf5 Use int64_t instead of ssize_t (#1673) 2024-12-07 20:10:44 -08:00
Cheng
7c10c93a1f Convert filesystem path to std::string explicitly (#1672) 2024-12-07 20:10:06 -08:00
Cheng
d92ea094f1 Use && instead of and (#1663)
* Use && instead of and

* Remove "and" in ops.cpp
2024-12-07 18:26:39 -08:00
Cheng
6ae5423b4a Do not pass integers to isnan (#1664) 2024-12-07 18:26:23 -08:00
Cheng
9635cffdc8 Include io.h in MSVC for IO functions (#1661) 2024-12-07 18:26:06 -08:00
Cheng
96986fb362 Use auto* for pointers (#1662) 2024-12-07 18:25:40 -08:00
Cheng
3ceb341a75 Use correct complex type for MSVC (#1660) 2024-12-07 18:25:22 -08:00
Awni Hannun
50fa705125 patch bump (#1656) 2024-12-06 13:16:19 -08:00
Awni Hannun
69a2991614 allow compiling lambdas in C++ (#1650)
* allow compiling lambdas in C++

* fix test

* more tests

* auto detect capture-less lambda
2024-12-06 13:13:21 -08:00
mt_caret
fd3377dd1f Support bias correction in Adam and AdamW optimizers (#1640) 2024-12-06 12:13:34 -08:00
Awni Hannun
d0b6cb0425 More primitives for compiling with shapeless (#1653)
* more shapeless and more Shape

* more shape

* fix

* fix
2024-12-06 11:29:18 -08:00
Alex Barron
95c4a2e3af add back conditionaltype (#1655) 2024-12-06 11:12:01 -08:00
Awni Hannun
bc2a29f033 fix (#1654) 2024-12-06 10:48:58 -08:00
Nripesh Niketan
3bb5b4a302 Chore: Add default language in pre-commit and bump hooks (#1652) 2024-12-06 07:54:29 -08:00
Awni Hannun
fc88fd9097 Shape and Strides 1 / N (#1645)
* shape and stride type def

* more shape
2024-12-05 12:53:43 -08:00
Awni Hannun
c5b0928c1f fix fallback (#1646) 2024-12-05 11:59:53 -08:00
Awni Hannun
e047fd977d compile changes if stream changes (#1644) 2024-12-03 14:37:44 -08:00
Jagrit Digani
9d40e521d7 Stop matrix copies with new attention kernel (#1639) 2024-12-02 14:12:38 -08:00
Alex Barron
1445dcaa60 let class predicate specify quantization parameters (#1638) 2024-12-02 14:09:28 -08:00
Jesper Stemann Andersen
e4eeb4e910 Added missing unordered_map includes (#1635)
* Added missing includes in mlx/io.h and mlx/backend/metal/metal.h

* Added additional missing unordered_map includes that fixes build on FreeBSD
2024-12-02 07:03:03 -08:00
Awni Hannun
aa86876813 fix transformer decoder post norm LN (#1637) 2024-12-02 07:02:17 -08:00
Jesper Stemann Andersen
974bb54ab2 CMake: Enabled using Accelerate on x86_64 / x64 (#1625)
* CMake: Enabled using Accelerate on x86_64 / x64

Cf. https://github.com/JuliaPackaging/Yggdrasil/pull/9761

* CMake: Removed superfluous MLX_BUILD_ARM
2024-11-28 10:55:45 -08:00
Ikko Eltociear Ashimine
9bc2183a31 docs: update device.cpp (#1632)
unecessary -> unnecessary
2024-11-27 20:58:26 -08:00
Awni Hannun
d4b222b6d3 Fix some leaks and races (#1629)
* fix leak and fix potential race

* more leak fixes

* fix one more
2024-11-27 20:01:20 -08:00
Jesper Stemann Andersen
af2af818a6 Enables build for *-linux-musl (#1627)
Also contributes to being able to build for *-w64-mingw32.

Cf. https://github.com/JuliaPackaging/Yggdrasil/pull/9761
2024-11-27 13:14:24 -08:00
Jesper Stemann Andersen
698e63a608 CMake: Build with dlfcn-win32 to have dlopen etc. on win32 (#1628)
Cf. https://github.com/JuliaPackaging/Yggdrasil/pull/9761
2024-11-27 13:14:13 -08:00
Awni Hannun
211411faf2 fix large ops (#1620) 2024-11-24 09:17:10 -08:00
Awni Hannun
bb303c45a5 version (#1617) 2024-11-22 12:00:03 -08:00
Alex Barron
6f7986d592 Cleaner qmv/qvm (#1616) 2024-11-22 11:14:08 -08:00
Awni Hannun
7cbb4aef17 Doc fix (#1615) 2024-11-22 11:12:25 -08:00
Jagrit Digani
02bec0bb6d Matrix Attention kernel (#1610)
* Rough INIT

* [WIP]: Loading and Matmuls added

* [WIP]: Reductions and min working aligned kernel at headdim = 64

* [WIP] Added headdim 80 for testing

* [WIP] Update dispatch params for testing

* [WIP] Add support for unaligned seq lengths - still looks messy

* Update sdpa_benchmarks

* Update sdpa_benchmarks

* Update sdpa_benchmarks

* Enable gqa support

* Update benchmark and switch off 128 headdim

* Update headdim 128 tuning

* Remove older fast attention code. Write out O strided

* Disable hd=128 until further optimizations

* Enable bf16

* Fix data size bug

* Enable attn build outside of jit
2024-11-22 10:34:05 -08:00
Alex Barron
c79f6a4a8c 3 and 6 bit quantization (#1613)
* Support 3 and 6 bit quantization
2024-11-22 10:22:13 -08:00
Awni Hannun
0c5eea226b Reduce specializations (#1607)
* start of reduce specializations

* fix all reduce

* fix many dims

* fix

* non-jit tests clear

* cleanup instantiations

* cpu merges

* change dim specializations

* optimize

* fix jit

* fix jit

* use higher precision for integer sum+prod

* fixes
2024-11-21 19:53:00 -08:00
Awni Hannun
dcca0d7477 contiguous op / prim (#1612) 2024-11-21 19:51:49 -08:00
Cocoa
0d5e7716ad fix typo: accross -> across (#1609)
Signed-off-by: Cocoa <i@uwucocoa.moe>
2024-11-20 15:30:51 -08:00
Angelos Katharopoulos
d8c824c594 Formatting fixes (#1606) 2024-11-20 15:30:36 -08:00
Saanidhya
cb431dfc9f Adds 3D pooling (#1526) 2024-11-19 16:45:24 -08:00
Awni Hannun
61d787726a Fix view scalar bug segfault (#1603)
* fix view scalar bug

* fix view scalar bug

* one more fix
2024-11-19 10:54:05 -08:00
Angelos Katharopoulos
5e89aace9b Fix concatenate vmap (#1600) 2024-11-19 10:44:04 -08:00
Awni Hannun
2af7e8a9a6 fix cmake version (#1601) 2024-11-19 08:45:05 -08:00
Awni Hannun
2419edd5b2 Faster indexing math in a few kernels (#1589)
* wip: faster compiled kernels

* faster general unary with uint specialization

* index type in compiled, unary, binary, ternary, copy

* fix jit

* jit fix

* specialize gather + scatter

* nit in docs
2024-11-18 19:52:00 -08:00
Awni Hannun
bf481e8e5d Fix sibling leak (#1590)
* add test

* fix + test

* fix fix
2024-11-18 19:17:01 -08:00
Awni Hannun
9d7fa6b8e6 Use osx deployment target to pick Metal version (#1595)
* choose metal based on deployment target rather than system version

* nit

* unused compile def
2024-11-18 19:16:49 -08:00
Angelos Katharopoulos
073076ac7d 2-Pass Sdpa Inference Kernel (#1597) 2024-11-18 17:31:53 -08:00
Awni Hannun
9bd03dd9b4 More buffer donation with no-ops (#1591)
* more donation

* fix test

* fix build
2024-11-18 08:35:41 -08:00
Awni Hannun
6931f84412 fix dispatch threads for a few kernels (#1594) 2024-11-18 08:35:25 -08:00
xnorai
16ec0556a0 Allocate raw JSON metadata buffer on the heap, and limit its size (#1596)
* Allocate raw JSON metadata buffer on the heap, and limit its size to 1GiB

* Set the upper size limit for the header to 100K as in Rust safetensors
2024-11-18 07:22:51 -08:00
Awni Hannun
610af352d4 Dispatch bf16 at run time when using the JIT (#1584)
* Dispatch bf16 at run time when using the JIT

* fix extension

* fix extension build

* fix extension build

* Update utils.h
2024-11-15 16:54:36 -08:00
Awni Hannun
b35f1e3c9c fix donation in sdpa (#1587) 2024-11-13 17:21:13 -08:00
Awni Hannun
dfa0b9aab4 Cpu fast quantize (#1578)
* cpu quantize

* fix
2024-11-08 20:10:39 -08:00
Alex Barron
a4c47b0276 OOB QMV fix (#1579)
* fix oob access in qmv

* skip more

* fix small case
2024-11-08 17:59:45 -08:00
Alex Barron
111fefd5e9 Fix OOB access in qmv (#1577)
* fix oob access in qmv

* skip more
2024-11-08 15:41:30 -08:00
Awni Hannun
c1fe1ef081 Bfs width limit (#1568)
* width limit

* fix

* large limit

* put env vars in env namespace
2024-11-08 15:00:46 -08:00
Awni Hannun
8c34c9dac4 throw for invalid case and remove test (#1575) 2024-11-08 12:04:03 -08:00
Awni Hannun
91c0277356 fix per-example mask + docs in sdpa (#1574) 2024-11-08 11:51:15 -08:00
Awni Hannun
9f0d5c12fc Fully wrap the command encoder (#1572)
* fully wrap the command encoder

* use consistent style + fix extensions
2024-11-08 11:50:21 -08:00
Awni Hannun
59247c2b62 add groups in conv2d (#1569) 2024-11-07 13:57:53 -08:00
Awni Hannun
9a3842a2d9 fix (#1566) 2024-11-06 17:10:33 -08:00
Alex Barron
726dbd9267 v0.20.0 (#1565) 2024-11-05 12:37:57 -08:00
Awni Hannun
54f05e7195 Fix gather vmap (#1563)
* fix gather

* fix
2024-11-05 11:29:20 -08:00
Alex Barron
26be608470 Add split_k qvm for long context (#1564)
* Add splitk qvm

* configurable splitk

* tuning

* remove extra instantiation

* remove refactor

* separate test

* cpu tolerance
2024-11-05 11:25:19 -08:00
Angelos Katharopoulos
248431eb3c Reductions update (#1351) 2024-11-04 22:25:16 -08:00
Awni Hannun
76f275b4df error in rms for wrong size (#1562) 2024-11-04 13:24:02 -08:00
Awni Hannun
f1951d6cce Use fewer barriers (#1561)
* use fewer barriers

* comment
2024-11-04 10:26:49 -08:00
Angelos Katharopoulos
62f297b51d Sdpa fix (#1558) 2024-11-02 21:25:46 -07:00
Awni Hannun
09bc32f62f No extra reshape (#1557)
* no extra reshape

* lint
2024-11-02 19:07:20 -07:00
Chris Offner
46d8b16ab4 Fix vmap example in docs (#1556) 2024-11-02 17:44:14 -07:00
Chris Offner
42533931fa Fix typo "it's" -> "its" (#1555) 2024-11-02 06:06:34 -07:00
Awni Hannun
9bd3a7102f add python 3.13 to circle (#1553) 2024-11-01 20:55:35 -07:00
Alex Barron
9e516b71ea Add dispatchThreads to custom kernel doc (#1551)
* add dispatchThreads info

* update

* add link
2024-11-01 13:07:48 -07:00
Awni Hannun
eac961ddb1 patch (#1550) 2024-10-31 16:10:14 -07:00
Awni Hannun
57c6aa7188 fix multi output leak (#1548) 2024-10-31 09:32:01 -07:00
Awni Hannun
cde5b4ad80 patch (#1546) 2024-10-30 19:31:22 -07:00
Awni Hannun
4f72c66911 improvements to scatter / gather (#1541) 2024-10-30 19:30:54 -07:00
Jagrit Digani
960e3f0f05 Gemm update (#1518) 2024-10-30 19:30:28 -07:00
Awni Hannun
884af42da2 Fix thread group for large arrays (#1543)
* fix thread group for large arrays

* comment

* one more
2024-10-30 16:25:12 -07:00
Alex Barron
048fabdabd Fix vmap constant output size (#1524)
* use inputs to determine output size

* remove noop vmap tests
2024-10-30 16:16:53 -07:00
Léo
917252a5a1 Add favicon to docs (#1545)
* add sphinx's html_favicon config

* removed unneeded newline

* ran pre-commit hooks
2024-10-30 13:54:13 -07:00
Carlo Cabrera
1a992e31e8 Skip using Residency sets in VMs (#1537)
* Skip using Residency sets in VMs

Attempting to use residency sets in a VM throws[^1]

    libc++abi: terminating due to uncaught exception of type std::runtime_error: [metal::Device] Unable to construct residency set.

Not quite sure if this is the best fix, but it does make the error go
away.

Note that it was previously possible to run simple programs that used
mlx in a VM prior to 0eb56d5be0. See
related discussion at Homebrew/homebrew-core#195627.

[^1]: https://github.com/Homebrew/homebrew-core/actions/runs/11525831492/job/32105148462#step:3:56

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* change residency check

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-10-29 19:37:23 -07:00
Awni Hannun
d2ff04a4f2 fix format (#1539) 2024-10-28 18:29:14 -07:00
Awni Hannun
015c247393 change wino dispatch condition (#1534) 2024-10-28 11:13:44 -07:00
Awni Hannun
d3cd26820e Faster bits and bernoulli (#1535)
* faster bits and bernoulli

* fix bernoulli
2024-10-28 11:11:00 -07:00
Awni Hannun
91f6c499d7 fix (#1529) 2024-10-25 19:25:35 -07:00
Awni Hannun
35e9c87ab9 patch bump (#1528) 2024-10-25 13:13:23 -07:00
Awni Hannun
8e88e30d95 BFS graph evaluation order (#1525)
* bfs order

* try fix event issue
2024-10-25 10:27:19 -07:00
Awni Hannun
0eb56d5be0 Wired (#1510)
* expose residency sets as wire/unwire

* returns wired size

* fix

* runtime support check

* fix os check

* fix test

* fix no metal build

* docs

* nit

* nits in docs

* nits
2024-10-25 09:35:33 -07:00
Paul Hansel
f70764a162 Fix typo in build docs (#1522) 2024-10-24 20:55:06 -07:00
Awni Hannun
dad1b00b13 fix (#1523) 2024-10-24 19:17:46 -07:00
Venkata Naga Aditya Datta Chivukula
430ffef58a [Feature] Added Sparse Initialization (#1498)
Co-authored-by: Saanidhyavats <saanidhyavats@gmail.com>
2024-10-24 12:31:24 -07:00
Alex Barron
3d17077187 Add mx.array.__format__ (#1521)
* add __format__

* actually test something

* fix
2024-10-24 11:11:39 -07:00
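A tiny example of what mx.array.__format__ from #1521 enables; the assumption here is that format specs apply to scalar arrays:

    import mlx.core as mx

    loss = mx.array(0.123456)
    # Format specs now work directly on scalar arrays inside f-strings
    print(f"loss = {loss:.3f}")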
Angelos Katharopoulos
c9b41d460f Working 64-bit scans (#1506) 2024-10-24 11:05:46 -07:00
xnorai
32972a5924 C++20 compatibility for fmt (#1519)
* C++20 compatibility for fmt

* Address review feedback

* Remove stray string

* Add newlines back
2024-10-24 08:54:51 -07:00
Dhruv Govil
f6afb9c09b Remove use of vector<const T> (#1514) 2024-10-22 16:31:52 -07:00
Kashif Rasul
3ddc07e936 Eigenvalues and eigenvectors (#1334)
* initial eigvalsh

* add compute_vectors

* add compute_vectors_

* return a pair

* add eigh to return only eigenvectors

* fixed typo

* merge Eigvalsh and Eigh into a single primitive

* use the same primitive with the flag

* fix primitives

* use MULTI

* fix eval_gpu

* fix declaration

* rename EighPrimitive to Eigh

* tests

* tests

* fix rebase and format

* cleanup lapack

* format

* add cblas.h

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-10-22 12:18:48 -07:00
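A short sketch of the symmetric eigendecomposition from #1334, assuming the bindings mx.linalg.eigvalsh / mx.linalg.eigh and that, like other linalg ops, they run on a CPU stream:

    import mlx.core as mx

    a = mx.array([[2.0, 1.0], [1.0, 2.0]])
    w = mx.linalg.eigvalsh(a, stream=mx.cpu)     # eigenvalues only
    w, v = mx.linalg.eigh(a, stream=mx.cpu)      # eigenvalues and eigenvectors
    print(w)                                     # roughly [1., 3.]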
Awni Hannun
c26208f67d Remove Hazard tracking with Fences (#1509)
* remove hazard tracking

* with fence map

* no hazard tracking with fences

* nits

* fix fence retain

* cleanup

* fix quantized rebase
2024-10-21 19:33:32 -07:00
Alex Barron
d15fa13daf Batched Quantized Matmul + Fast Small QMV (#1503)
* add fast qmv for small dims

* fix test

* batched cpu

* add batched template param

* refactor metal quantized.cpp
2024-10-21 16:23:17 -07:00
Awni Hannun
58a855682c v0.19.0 (#1502) 2024-10-18 11:55:18 -07:00
Awni Hannun
92d7cb71f8 Fix compile (#1501)
* fix compile

* fix space
2024-10-18 11:06:40 -07:00
Angelos Katharopoulos
50d8bed468 Fused attention for single query (#1497) 2024-10-18 00:58:52 -07:00
Awni Hannun
9dd72cd421 fix gumbel (#1495) 2024-10-17 13:52:39 -07:00
Awni Hannun
343aa46b78 No more 3.8 (#1493) 2024-10-16 17:51:38 -07:00
Awni Hannun
b8ab89b413 Docs in ci (#1491)
* docs in circle
2024-10-15 17:40:00 -07:00
Awni Hannun
f9f8c167d4 fix submodule stubs (#1492) 2024-10-15 16:23:37 -07:00
Awni Hannun
3f86399922 Real and Imag (#1490)
* real and imag

* fix

* fix
2024-10-15 16:23:15 -07:00
LastWhisper
2b8ace6a03 Typing the dropout. (#1479) 2024-10-15 06:45:46 -07:00
Awni Hannun
0ab8e099e8 Fix cpu segfault (#1488)
* fix cpu segfault

* nit in tests
2024-10-14 16:17:03 -07:00
Awni Hannun
020f048cd0 A few updates for CPU (#1482)
* some updates

* format

* fix

* nit
2024-10-14 12:45:49 -07:00
Awni Hannun
881615b072 Faster metal compiled kernels + some fixes (#1486)
* bump mac tests to use py39

* work per thread for compiled kernels

* fix for large arrays

* fix
2024-10-14 12:45:38 -07:00
Awni Hannun
0eef4febfd bump mac tests to use py39 (#1485) 2024-10-14 10:40:32 -07:00
Awni Hannun
b54a70ec2d Make push button linux distribution (#1476)
* try again

* try again

* try again

* try again

* try again

* try again

* try again

* try again

* .circleci/config.yml

* one more fix

* nit
2024-10-14 06:21:44 -07:00
Awni Hannun
bf6ec92216 Make the GPU device more thread safe (#1478)
* gpu stream safety

* comment

* fix
2024-10-12 17:49:15 -07:00
Awni Hannun
c21331d47f version bump (#1477) 2024-10-10 13:05:17 -07:00
Awni Hannun
e1c9600da3 Add mx.random.permutation (#1471)
* random permutation

* comment
2024-10-08 19:42:19 -07:00
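A minimal sketch of mx.random.permutation from #1471; passing an int is assumed to permute arange(n), and passing an array to shuffle it along its first axis:

    import mlx.core as mx

    print(mx.random.permutation(5))     # e.g. array([3, 0, 4, 1, 2])
    x = mx.arange(10)
    print(mx.random.permutation(x))     # a shuffled copy of x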
Awni Hannun
1fa0d20a30 consistently handle all -inf in softmax (#1470) 2024-10-08 09:54:02 -07:00
Awni Hannun
3274c6a087 Fix array is_available race cases (#1468) 2024-10-07 19:13:50 -07:00
Angelos Katharopoulos
9b12093739 Add the roll op (#1455) 2024-10-07 17:21:42 -07:00
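A small example of the roll op from #1455, which cyclically shifts elements (NumPy-style semantics assumed):

    import mlx.core as mx

    x = mx.arange(5)
    print(mx.roll(x, 2))            # [3, 4, 0, 1, 2]
    print(mx.roll(x, -1, axis=0))   # [1, 2, 3, 4, 0]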
Awni Hannun
f374b6ca4d Bump nanobind to 2.2 (#1461)
* bump nanobind

* extension version for tests
2024-10-07 16:52:40 -07:00
Awni Hannun
0070e1db40 Fix deep recursion with siblings (#1462)
* fix recursion with siblings

* fix

* add test

* increase tol
2024-10-07 06:15:33 -07:00
Awni Hannun
95d04805b3 Fix complex power on Metal (#1460) 2024-10-06 19:58:30 -07:00
Awni Hannun
e4534dac17 Conv grad with groups + bugfix (#1449)
* fix bug in flipped conv with groups, start of grad for groups

* fix

* fix

* fix + test
2024-10-06 07:08:53 -07:00
Angelos Katharopoulos
fef3c4ec1d Fix mpi test in CI (#1456)
* Fix mpi test in CI

* Set bind to none
2024-10-06 06:09:17 -07:00
Awni Hannun
1bdc038bf9 fix argpartition + faster {arg} sorts / partitions (#1453) 2024-10-03 14:21:25 -07:00
Awni Hannun
5523d9c426 faster cpu indexing (#1450) 2024-10-03 13:53:47 -07:00
Angelos Katharopoulos
d878015228 Fix normalization check_input (#1452) 2024-10-03 13:26:56 -07:00
Cheng
5900e3249f Fix building on Linux (#1446) 2024-09-30 07:00:39 -07:00
Angelos Katharopoulos
bacced53d3 Fix row reduce with very few rows (#1447) 2024-09-29 20:00:35 -07:00
Lucas Newman
4a64d4bff1 Add support for grouped 1D convolutions to the nn API (#1444)
* Fix the weight shape for grouped convolutions from the nn API.

* Add tests.

* Pre-commit formatting.

* Add input validation.

* Use integer division instead of casting.

* docs

* nit

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-09-28 06:41:07 -07:00
Awni Hannun
b1e2b53c2d bump (#1445) 2024-09-27 13:53:02 -07:00
Awni Hannun
11354d5bff Avoid io timeout for large arrays (#1442) 2024-09-27 13:32:14 -07:00
Awni Hannun
718aea3f1d allow take to work with integer index (#1440) 2024-09-26 15:58:03 -07:00
Awni Hannun
5b6f38df2b Faster cpu ops (#1434)
* faster binary and cleaner copy

* use recursive template for other ops

* more cleanup

* fix from cleanup

* more clean

* fix binary

* use contiguous iterator

* add 3d

* nits

* fix

* fix?

* fix

* fix rebase
2024-09-26 09:19:13 -07:00
Awni Hannun
0b4a58699e Some overhead reductions in mx.fast.metal_kernel (#1437)
* some overhead reductions

* fix

* use +=

* use more +=
2024-09-25 17:25:21 -07:00
Awni Hannun
4f9f9ebb6f Faster Metal unary and binary for general case (#1431)
* faster unary and binary for general case

* update ternary + jit fix

* fix jit

* unary work per thread
2024-09-25 12:07:43 -07:00
Awni Hannun
afc9c0ec1b dtype is copy assignable (#1436) 2024-09-25 12:07:13 -07:00
Awni Hannun
195b429d99 Put along axis + fixe for partition grad (#1430)
* put along axis, fixes for partition grad

* zeros for arg reduce
2024-09-23 10:03:38 -07:00
Luke Carlson
2b878e9dd7 Create CITATION.cff (#1425) 2024-09-20 11:39:46 -07:00
Awni Hannun
67b6bf530d Optimization for general ND copies (#1421) 2024-09-17 17:59:51 -07:00
Nripesh Niketan
6af5ca35b2 feat: add cross_product (#1252)
* feat: add cross_product

* lint

* python binding

* refactor: Improve error message for cross_product function

* refactor: more close to numpy cross product

* refactor: improve error message for cross_product function

* finish

* fix acks

* allow old numpy

* doc

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-09-17 13:12:43 -07:00
Awni Hannun
4f46e9c997 More fixes for arrays with large sizes (#1405)
* compile works for big arrays when contiguous

* style

* nits in docs

* a bunch more stuff

* update jit

* update jit

* use constant for shapes and strides and remove elem_to_loc overload

* use kernel instantiation

* docs nits

* update binary and ternary

* comments
2024-09-17 12:46:31 -07:00
Awni Hannun
c6739ba7f3 Faster RNN layers (#1419)
* faster rnn

* use admm
2024-09-17 06:04:19 -07:00
Angelos Katharopoulos
914409fef9 Data parallel helper (#1407) 2024-09-16 18:17:21 -07:00
jjuang-apple
8d68a3e805 remove fmt dependencies from MLX install (#1417) 2024-09-16 13:32:28 -07:00
jjuang-apple
6bbcc453ef avoid using find_library to make install truly portable (#1416) 2024-09-16 13:21:32 -07:00
Awni Hannun
d5ed4d7a71 override class function (#1418) 2024-09-16 13:21:04 -07:00
Nripesh Niketan
669c27140d Chore: add pre-commit hook for cmake (#1362)
* reset and lint

* format

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-09-16 12:53:01 -07:00
Max-Heinrich Laves
adcc88e208 Conv cpu improvements (#1410) 2024-09-15 18:45:10 -07:00
Awni Hannun
d6492b0163 fix clip (#1415) 2024-09-14 16:09:09 -07:00
Awni Hannun
b3f52c9fbe ensure io/comm streams are active before eval (#1412) 2024-09-14 06:17:36 -07:00
c0g
bd8396fad8 Fix typo in transformer docs (#1414) 2024-09-14 06:05:15 -07:00
Angelos Katharopoulos
d0c58841d1 Patch bump (#1408) 2024-09-12 16:44:23 -07:00
Angelos Katharopoulos
881f09b2e2 Allow querying the allocator for the buffer size (#1404) 2024-09-11 21:02:16 -07:00
Awni Hannun
8b30acd7eb fix module attribute set, reset, set (#1403) 2024-09-11 16:30:42 -07:00
Awni Hannun
02efb310ca Xcode 160 (#1384)
* xcode 16.0 with debug tests

* limit nproc for builds

* vmap bug

* assert bug

* run python tests in debug mode

* fix view, bool copies preserve bits

* actual view fix
2024-09-10 15:15:17 -07:00
Awni Hannun
e7e59c6f05 Fix copying scalars by adding fill_gpu (#1402)
* fix copying scalars by adding fill_gpu

* Another copy scalar changed to fill

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-09-09 15:54:08 -07:00
Awni Hannun
3ae6aabe9f throw for certain cases of non captured inputs in compile (#1401) 2024-09-09 14:54:31 -07:00
xnorai
dc627dcb5e Replace the use of result_of_t with invoke_result_t (#1397)
* Fix C++20 incompatibility

* Fix C++20 incompatibility
2024-09-06 19:52:57 -07:00
Max-Heinrich Laves
efeb9c0f02 Transposed Convolution (#1245)
* initial implementation for conv_transpose

ran pre-commit

implemented conv_transpose

updated conv_general docstring

updated conv_general docstring

updated code comments

removed commented run_conv_checks

updated acknowledgments

added missing entry to ops.rst

added op to nn.layers

resolved merge conflicts

* removed ConvolutionTranspose primitive as suggested by reviewer

removed ConvolutionTranspose primitive as suggested by reviewer

* remove transpose flag, add another test

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-09-06 19:52:38 -07:00
Awni Hannun
ba3e913c7a Simplifications for MLX C (#1396)
* simplifications for MLX C

* use vectors instead of map

* update examples
2024-09-06 19:16:50 -07:00
Awni Hannun
7cca1727af Fix slice data size (#1394)
* fix slice data size and add tests

* fix contiguous flag

* simplify stride and perform copy for non-contiguous arrays

* fix cpu

* comment
2024-09-04 19:10:43 -07:00
Bhargav Yagnik
11371fe251 Test to prevent bugs like #1386 (#1391)
* updated test_array for missing ops

* formatting changes
2024-09-04 17:24:30 -07:00
Awni Hannun
41c603d48a fix jit reduce (#1395) 2024-09-04 14:03:10 -07:00
Angelos Katharopoulos
969337345f Fix reduce edge case (#1389) 2024-09-01 21:37:51 -07:00
Awni Hannun
9592766939 add std as method (#1387)
* add std as method

* add std as method
2024-09-01 19:49:16 -07:00
Angelos Katharopoulos
58dca7d846 Fix copy in the sort primitive (#1383) 2024-08-31 08:32:14 -07:00
Awni Hannun
0d302cd25b Fix compile with byte sized constants (#1381) 2024-08-30 17:24:35 -07:00
Alex Barron
da691257ec Fix overflow in quantize/dequantize (#1379)
* add 2d indices to prevent overflow

* use nthreads not out size
2024-08-30 13:32:41 -07:00
Angelos Katharopoulos
1600092e92 Patch bump (#1376) 2024-08-29 16:54:30 -07:00
Awni Hannun
dba2bd1105 Even Even Faster IO (#1374)
* even more faster io

* make reader pool static

* make python reader thread safe

* one more optimization
2024-08-29 16:05:40 -07:00
Alex Barron
28be4de7c2 Fix JIT reductions (#1373) 2024-08-28 16:39:11 -07:00
Awni Hannun
a6c3b38fba Async load (#1372)
* async load

* async load
2024-08-28 14:21:55 -07:00
Awni Hannun
fcb65a3897 Even Faster I/O (#1369)
* try multithreading for faster IO

* smaller batch size

* Account for pread returning less than size

* nit

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-08-28 11:49:07 -07:00
Saanidhya
4e22a1dffe In continuation to PR1243 to solve issue #1240 (#1365)
* Solves issue #1240

* Correction

* Update python/mlx/utils.py

* Update python/mlx/utils.py

---------

Co-authored-by: Awni Hannun <awni@apple.com>
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-08-28 11:40:41 -07:00
Awni Hannun
291cf40aca Some fixes to typing (#1371)
* some fixes to typing

* fix module reference

* comment
2024-08-28 11:16:19 -07:00
Jeethu Rao
bd47e1f066 Fix neon_fast_exp and add more softmax tests (#1367) 2024-08-27 23:42:42 -07:00
Aditya Dhulipala
e6b223df5f Pinv (#875) 2024-08-27 23:06:12 -07:00
Angelos Katharopoulos
e64349bbdd Make eval just wait if all arrays are scheduled (#1368) 2024-08-27 17:01:22 -07:00
Angelos Katharopoulos
cdb59faea6 Adds send/recv ops in distributed (#1366) 2024-08-26 23:01:37 -07:00
Alex Barron
1d94ac3f90 Add optional headers to `mx.fast.metal_kernel` (#1358) 2024-08-26 21:45:45 -07:00
Awni Hannun
5f7d19d1f5 MPI ops in GPU stream for faster comms (#1356) 2024-08-26 15:12:50 -07:00
Awni Hannun
2fdf9eb535 Fix ternary for large arrays (#1359)
* fix ternary for large arrays

* fix
2024-08-26 11:22:27 -07:00
Awni Hannun
860d3a50d7 fix extension metal library finding (#1361) 2024-08-26 09:18:50 -07:00
Alex Barron
d1183821a7 int() and float() for mx.array (#1360) 2024-08-25 20:41:44 -07:00
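What #1360 enables, in one line: scalar arrays can be converted with the built-in int() and float():

    import mlx.core as mx

    x = mx.array(3.7)
    print(int(x), float(x))   # 3 3.7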
Angelos Katharopoulos
8081df79be Fix boolean all reduce bug (#1355) 2024-08-24 10:09:32 -07:00
Nripesh Niketan
64bec4fad7 Chore: update pre-commit hooks (#1353)
* Chore: update pre-commit refs

* run pre-commit
2024-08-24 06:46:36 -07:00
Alex Barron
b96e105244 Add grid_sample example to metal_kernel docs (#1352)
* Add `zero_outputs` and `atomic_outputs` options to `metal_kernel`

* add grid sample to docs

* zero_outputs -> init_value

* add missing header for linux
2024-08-23 18:24:16 -07:00
Awni Hannun
3b4d5484c7 Bump extension MLX version (#1350)
* Bump extension MLX version

* fix some docs nits
2024-08-23 12:38:34 -07:00
Alex Barron
684e11c664 patch (#1347) 2024-08-23 10:42:02 -07:00
Angelos Katharopoulos
b57a52813b Further reduction tuning (#1349)
* More reduction tuning
* Forgotten pdb
* Small column long row specialization
2024-08-23 10:35:25 -07:00
Alex Barron
da8deb2b62 fix bug with multiple attributes (#1348)
Co-authored-by: Alex Barron <abarron22@apple.com>
2024-08-23 10:06:15 -07:00
Awni Hannun
98b6ce3460 Refactor reductions and fix scatter atomics for large sizes (#1300)
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-08-22 16:03:31 -07:00
Awni Hannun
f9e00efe31 fix nanobind and stub gen in circle (#1346) 2024-08-22 14:07:27 -07:00
Alex Barron
0fd2a1f4b0 Custom Metal Kernels from Python (#1325)
* start

* simple kernels working

* restructure

* inverse example working

* docs + fixes

* missing file

* fix imports

* address comments

* add docs + fix test

* Review comments + refactor to a single function

* update docs

* remove hashing

* fix contig bug in test

* back to a class

* trailing whitespace

* fix tests

* match c++ and python apis

* add link + make args kw_only
2024-08-22 13:46:29 -07:00
Awni Hannun
df3233454d 2d gather specialization (#1339) 2024-08-22 10:48:24 -07:00
Awni Hannun
82db84b899 bump nanobind + fix extension (#1344) 2024-08-21 16:05:07 -07:00
Awni Hannun
8ae751d3da fix io (#1343)
* fix io

* fix io

* comment
2024-08-21 13:14:46 -07:00
Awni Hannun
d40e76809f Fix rope (#1340)
* add test

* fix rope

* fix test
2024-08-20 17:37:52 -07:00
Awni Hannun
bb1b76d9dc RoPE with frequencies as optional input (#1337)
* start rope with freq input

* rope with frequencies

* nits

* fix bug

* fix bug + test

* cleanup

* optional base
2024-08-19 18:30:50 -07:00
Angelos Katharopoulos
9d26441224 Fix contiguity check (#1336)
Co-authored-by: Alex Barron <abarron22@apple.com>
2024-08-19 16:05:06 -07:00
Awni Hannun
f12f24a77c fix compiling with space in paths (#1332) 2024-08-15 16:39:24 -07:00
Awni Hannun
ae5b5cabfd Fix optimizer reloading from checkpoint (#1329)
* fix optimizer reloading from checkpoint

* comment
2024-08-15 07:33:23 -07:00
Awni Hannun
d0630ffe8c Read arrays from files faster (#1330)
* read faster

* faster write as well

* set default permission for linux

* comment
2024-08-14 20:09:56 -07:00
Alex Barron
99bb7d3a58 GPU mx.sign for complex64 (#1326) 2024-08-14 07:54:53 -07:00
Awni Hannun
63ae767232 fix transformer (#1327) 2024-08-13 16:04:26 -07:00
Awni Hannun
eaaea02010 Add isfinite (#1318)
* isfinite

* remove reduce test since fix is not complete
2024-08-13 14:49:28 -07:00
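A minimal sketch of mx.isfinite from #1318 (NumPy-style semantics assumed: False for inf and nan):

    import mlx.core as mx

    x = mx.array([1.0, float("inf"), float("nan")])
    print(mx.isfinite(x))   # [True, False, False]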
Bhargav Yagnik
a098bc92e0 Fix: Preserve input dtype in Dropout layer output (#1323)
* Fix: Preserve input dtype in Dropout layer output

- Modified Dropout implementation to ensure that the output dtype matches the input dtype.
- This resolves the issue #1321

* Update test cases in test_nn.py

- Revised test cases to align with updated dropout code
- Fixed assertion method: replaced self.assertTrue with self.assertEqual for accurate comparisons in test_nn.py -> test_rope, test_alibi and test_dropout,

* updated dropout.py
2024-08-13 11:54:21 -07:00
Awni Hannun
1086dc4db0 patch (#1320) 2024-08-12 16:13:33 -07:00
Brian Keene
19fb69e2ed Add memory_efficient_threshold kwarg to sdpa kernel (#1319)
Allows opt-in to memory efficient GPU shader at the prescribed sequence
length. Otherwise, utilizes aggregate MLX primitives for best latency.
2024-08-12 12:57:09 -07:00
Awni Hannun
9231617eb3 Move to nanobind v2 (#1316) 2024-08-08 17:17:46 -07:00
Alex Barron
32668a7317 CPU mx.linalg.cholesky_inverse and mx.linalg.tri_inv (#1307)
* add cholesky inv + tri inv

* always run tri_inv on cpu

* consistent naming
2024-08-08 15:18:02 -07:00
Angelos Katharopoulos
780c197f95 Fix test tolerance and patch bump (#1315) 2024-08-08 14:51:09 -07:00
Angelos Katharopoulos
eb8819e91e Revert variance to be numerically stable (#1314) 2024-08-08 13:35:02 -07:00
Awni Hannun
30bbea2f08 Add gemv masked to JIT plus some fixes (#1310)
* add gemv masked to JIT plus some fixes

* some cleanup

* add utils

* fix

* fix 2

* more cleaning

* fix

* remove unused mps matmul support

* one more nit

* revert
2024-08-07 13:38:07 -07:00
Alex Barron
635ccd9e25 Add "edge" mode to mx.pad (#1309)
* Add edge padding mode

* fix pad in pooling

* string arg instead of enum
2024-08-06 11:23:10 -07:00
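A quick sketch of the "edge" padding mode from #1309; per the commit notes the mode is selected with a string argument, here assumed to be the keyword mode:

    import mlx.core as mx

    x = mx.array([[1, 2], [3, 4]])
    # Replicate the border values instead of padding with a constant
    print(mx.pad(x, 1, mode="edge"))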
nicolov
8c9f0278b9 Add vmap to scatter (#1200)
* Add vmap to scatter

* updates

* vmap updates + a few more tests

* bug fix

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-08-05 20:12:27 -07:00
Awni Hannun
58d0e199e1 add bfloat conv for winograd (#1306)
* add bfloat conv for winograd

* accumulate in fp32

* accumulate in fp32

* accumulate in bf16
2024-08-05 15:51:13 -07:00
Awni Hannun
10b5835501 fix creating array from bf16 tensors in jax / torch (#1305) 2024-08-01 16:20:51 -07:00
Awni Hannun
6c8dd307eb faster group norm (#1304) 2024-08-01 12:49:23 -07:00
Awni Hannun
43ffdab172 fix rope and random (#1301)
* fix rope and random

* comment
2024-07-31 16:18:25 -07:00
Awni Hannun
40b6d67333 Fixes for large arrays with a few ops (#1299)
* fixes for large arrays with a few ops

* fix bug

* fix all of copy
2024-07-30 17:18:39 -07:00
Alex Barron
c52d1600f0 Fused Affine Quantize/Dequantize ops (#1282)
* Add fast affine dequantize

* add full quantize kernel

* fused kernel with scale/bias computation

* fix docstring

* fix no jit error

* fix test

* test fix

* reduce fast api to only affine_quantize
2024-07-29 15:11:38 -07:00
Awni Hannun
aa1d6cadad Fix docs latex build and nits (#1297)
* fix docs latex build and nits

* fix stub gen and try to clean up building
2024-07-29 11:44:06 -07:00
Atakan Tekparmak
6e06e3a904 feat: Added "tanh" option to GELU approximation (#1268) 2024-07-28 09:07:56 +02:00
Yaroslav
8cfb9fc0b8 Update requirements.txt (#1291) 2024-07-26 12:59:52 -07:00
Awni Hannun
7b456fd2c0 Array api (#1289)
* some updates for numpy 2.0 and array api

* some updates for numpy 2.0 and array api

* fix array api doc
2024-07-26 10:40:49 -07:00
Awni Hannun
e9e53856d2 patch bump (#1287) 2024-07-25 11:42:09 -07:00
Anton Belov
5029894662 [Issue #1187] Add nan_to_num function initial attempt (#1247)
* initial attempt, working with wrong types

* not compiling; mx.float16 and mx.bfloat16 tests added

* fix nan to num

* nit

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-07-25 09:57:37 -07:00
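A small sketch of mx.nan_to_num from #1247, assuming NumPy-like defaults (nan -> 0, +/-inf -> the largest finite values) and keyword overrides:

    import mlx.core as mx

    x = mx.array([float("nan"), float("inf"), -float("inf"), 1.0])
    print(mx.nan_to_num(x))
    print(mx.nan_to_num(x, nan=-1.0))   # custom replacement for nan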
Awni Hannun
baf9fa5f42 Einsum (#1269)
* einsum initial

* fix comma break

* sum axis was wrong

* small cleanups

* python binding

* changed bindings to resemble numpy

* remove todo comment

* comment changes

* add count of operands/inputs

* fail fast if operands list is empty

* ignore comma if no output

* einsum path matching numpy

* getting somewhere with path

* remove print

* it passes the first test

* moved einsum tests to separate file

* separated einsum path

* moved einsum naive

* remove space from equation

* fast fail if no operands passed

* update tests and remove printf

* small cleanup

* some more cleanups

* removed python helper file

* ack

* utilize std for finding min in vector

* duplicate def

* remove the tuple as it was unreadable

* moved einsum_naive back to ops

* remaining isn't needed

* avoid creating another set

* cleanup

* greedy path, start of naive einsum

* more einsum

* fix some bugs

* some more fixes, tests pass

* benchmark

* some simplify

* fix einsum and test

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>

* add a bunch more tests and fix a bunch more bugs

* some docs nits

---------

Co-authored-by: dc-dc-dc <dgcruz983@gmail.com>
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-07-25 09:36:44 -07:00
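A couple of mx.einsum calls illustrating the op added in #1269 (NumPy-style subscripts assumed):

    import mlx.core as mx

    a = mx.random.normal((4, 8))
    b = mx.random.normal((8, 3))
    c = mx.einsum("ij,jk->ik", a, b)   # equivalent to a @ b
    s = mx.einsum("ij->j", a)          # sum over the first axis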
Jagrit Digani
7f914365fd Fix GPU sort for large arrays (#1285)
* Fix GPU sort for large arrays
2024-07-24 14:37:10 -07:00
Paul Paczuski
ebd7135b50 Improve stability of BCE loss calculation for input probabilities close to or exactly 0 or 1 (#1280)
* Improve stability of BCE loss calculation

* Standardize comment

* Apply formatting with black via pre-commit

* Add usage recommendation to docstring

* Update python/mlx/nn/losses.py

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-07-24 08:38:22 -07:00
fgranqvist
50eff6a10a Implement sampling from laplace distribution. (#1279) 2024-07-24 15:15:37 +02:00
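A sketch of Laplace sampling from #1279, assuming it lands as mx.random.laplace with shape, loc, and scale arguments (names are not confirmed by this log):

    import mlx.core as mx

    # 1000 samples from Laplace(loc=0, scale=2); argument names are assumptions
    samples = mx.random.laplace(shape=(1000,), loc=0.0, scale=2.0)
    print(samples.mean(), samples.std())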
Alex Barron
c34a5ae7f7 Fix bfloat16 Hadamard (#1283)
* fix bfloat16 hadamard

* add scale

* review comments

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-07-23 14:54:43 -07:00
Awni Hannun
e2aa6ec8ae some fixes (#1281) 2024-07-23 11:49:05 -07:00
toji
6768c6a54a Adding missing type hints (#1243)
* added type hints for `run`, `tree_map` and `tree_map_with_path`

* fix lint

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-07-23 07:29:38 -07:00
Tim Gymnich
6307d166eb Fix overflow / underflow handling for expm1f (#1278)
* Fix overflow / underflow handling for expm1f

* update tests
2024-07-23 07:29:06 -07:00
Awni Hannun
1fba87b0df Fix leak with multi-output primitives (#1274)
* fix leak with multi-output primitives

* hopefully an actual fix
2024-07-23 06:34:18 -07:00
Awni Hannun
df124e018a fix gguf (#1273)
* fix gguf

* comment
2024-07-18 07:35:35 -07:00
Cheng
2f83d6e4b7 Do not release buffers on exit (#1142) 2024-07-15 15:12:24 -07:00
Feng Shijie
987785d8d7 Fix typo and missing header (#1266) 2024-07-15 08:20:24 -07:00
Awni Hannun
8c01a7893b minor fix in optimizer + docs (#1264) 2024-07-12 12:18:02 -07:00
Awni Hannun
218047c75a docs fixes (#1263) 2024-07-11 15:59:07 -07:00
Alex Barron
d0da74209b version bump (#1260) 2024-07-11 11:17:55 -07:00
Angelos Katharopoulos
5c1fa64fb0 Custom transforms (#1246) 2024-07-10 18:00:01 -07:00
Alex Barron
a3c287354f Fast Hadamard Transform (#1249)
* Working hadamard for powers of 2

* working for m*2^k

* add scale and check contiguity

* add size check

* clean up

* fix test

* add grads + vmap

* gpu only

* skip on linux

* test typo

* add cpu impl

* remove gpu only tests

* fix linux build + add is_equivalent
2024-07-09 20:39:01 -07:00
Angelos Katharopoulos
03cf033f82 Fix reshape copy bug (#1253) 2024-07-07 21:37:00 -07:00
Alex Barron
bdb36c9a63 add zero vjps for bitwise ops and gather w.r.t. index (#1256) 2024-07-07 21:34:59 -07:00
Awni Hannun
20bb301195 CPU binary reduction + Nits (#1242)
* very minor nits

* reduce binary

* fix test
2024-06-28 13:50:42 -07:00
Awni Hannun
d6383a1c6a version bump (#1239) 2024-06-27 10:43:13 -07:00
Angelos Katharopoulos
b05bcfd27f Fixes segfault when compiling checkpointed functions (#1235) 2024-06-26 16:14:45 -07:00
Alex Barron
2615660e62 Fix strided sort bug (#1236)
* Use output strides in sort kernel

* fix zero strides bug
2024-06-26 14:32:11 -07:00
Awni Hannun
5b0af4cdb1 fix donation condition for compilation (#1237) 2024-06-26 09:04:05 -07:00
Jagrit Digani
8c2e15e6c8 Accelerate import updates for iOS (#1227)
* Update veclib and bnns includes to #include <Accelerate/Accelerate.h> for compatibility with ios

* Mark float literals in softmax.cpp to be float16_t for errors in ios

* Add arm neon vector operation guards

* Redirect to common backend for consistency
2024-06-26 09:01:50 -07:00
Awni Hannun
56c8a33439 Get metal version from xcode (#1228)
* get metal version from xcode

* typo

* fix
2024-06-26 07:02:11 -07:00
David Koski
4eef1e8a3e fix typo (#1215) 2024-06-24 13:36:35 -07:00
Alex Barron
95d11bda06 Fix NumPy 2.0 pickle test (#1221)
* fix numpy version <2 temporarily

* typo

* better fix

* Fix just for bfloat16

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-23 05:47:22 -07:00
Awni Hannun
af9079cc1f version bump (#1212) 2024-06-14 11:28:51 -07:00
Jagrit Digani
2d6cd47713 Masked gemv (#1211) 2024-06-14 09:52:26 -07:00
Awni Hannun
fe3167d7ea smaller CPU binary (#1203)
* smaller CPU binary

* fix no cpu build
2024-06-14 09:46:55 -07:00
Awni Hannun
31e134be35 Build for macOS 15 (#1208)
* Build for macos 15

* metal32 as well

* comment

---------

Co-authored-by: Awni Hannun <Awni Hannun>
2024-06-13 13:31:44 -07:00
Awni Hannun
e84ba8056d only allow openmpi (#1209) 2024-06-13 12:14:44 -07:00
Fangjun Kuang
f20e97b092 minor fixes (#1194)
* minor fixes

* fix build errors
2024-06-12 22:06:49 -07:00
Alex Barron
934683088e Refactor JIT for unary/binary/ternary ops (#1206)
* refactor unary/binary/ternary ops

* get_primitive_string util

---------
2024-06-12 14:22:12 -07:00
Awni Hannun
de2b9e7d0a Fix kernel deps to reduce build times (#1205) 2024-06-12 11:17:39 -07:00
Alex Barron
dd7d8e5e29 Add Quantized Ops to the JIT (#1204)
* JIT for quantized ops

* remove unused imports

* address comments

* fix imports

* second attempt to fix imports

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-12 09:47:12 -07:00
Awni Hannun
df964132fb fix scatter + test (#1202)
* fix scatter + test

* fix test warnings

* fix metal validation
2024-06-11 14:35:12 -07:00
Awni Hannun
709ccc6800 install mpi for release build (#1199) 2024-06-10 10:09:32 -07:00
Awni Hannun
cf236fc390 version (#1191) 2024-06-06 17:16:40 -07:00
Alex Barron
27d70c7d9d Feature complete Metal FFT (#1102)
* feature complete metal fft

* fix contiguity bug

* jit fft

* simplify rader/bluestein constant computation

* remove kernel/utils.h dep

* remove bf16.h dep

* format

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-06 12:57:25 -07:00
nicolov
0e585b4409 Add docstring for scatter (#1189)
* Add docstring for scatter

* docs nits

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-06-06 11:51:25 -07:00
Angelos Katharopoulos
0163a8e57a Add docs for the distributed namespace (#1184) 2024-06-06 11:37:00 -07:00
Awni Hannun
578842954c fix jit scan when output doesn't have primitive (#1190) 2024-06-06 07:24:58 -07:00
Awni Hannun
496315fe1d Fix scan (#1188)
* fix scan

* improve grid size

* fix cpu cummax
2024-06-05 14:21:58 -07:00
Angelos Katharopoulos
0fe6895893 Fix the hard-shrink test (#1185) 2024-06-04 16:22:56 -07:00
Nikhil Mehta
0b7d71fd2f Add softmin, hardshrink, hardtanh (#1180)
---------

Co-authored-by: Nikhil Mehta <nikmehta@tesla.com>
2024-06-04 15:48:18 -07:00
Awni Hannun
83b11bc58d Fix Metal API validation for empty concat (#1183) 2024-06-04 13:17:08 -07:00
Alex Barron
375a8bbdcc Add some internal GPU apis (#1177)
* Add unary/binary/ternay/slice/concat internal GPU ops

* add pad internal op

* formatting + no_cpu fix
2024-06-04 09:24:26 -07:00
Awni Hannun
ea9090bbc4 Add view op (#1179)
* add view primitive

* nit

* fix view
2024-06-04 08:05:27 -07:00
nicolov
81def6ac76 Fix benchmark (#1175) 2024-06-04 07:50:46 -07:00
Angelos Katharopoulos
3de8ce3f3c In place all-reduce and forgiving init (#1178) 2024-06-03 16:47:47 -07:00
Alex Barron
4d485fca24 Add defines include (#1176)
Co-authored-by: Alex Barron <abarron22@apple.com>
2024-06-03 09:50:10 -07:00
Brian Keene
1865299a30 Metal shaders for memory efficient self attention on large sequences (#964)
* Metal shaders for efficient self attention on large sequences

Updated fast attention: GEMM-ified with Steel primitives
Uses flash attention 1 for scale correction

* more compiler silencing

* Address rebase issues

* Templatize kernel instantiation, revise cpu bindings

* Safer writes to output

* Permit batch size > 1

* Numerical fixes for sdpa self attention

* Re-enable test, remove unused variable

* add benchmarking script

* Disable sdpa prior to perf tuning, and simplify tests for per-patch CI
2024-06-03 09:16:19 -07:00
Dominik Schlösser
3576b547c5 Doc error for default for scale in SinusoidalPositionalEncoding (#1174) 2024-06-02 13:42:45 -07:00
Awni Hannun
079882495d version bump (#1172) 2024-05-31 12:29:12 -07:00
K Venkat Ramnan
ab977109db feat: Added dlpack device (#1165)
* feat: Added dlpack device

* feat: Added device_id to dlpack device

* feat: Added device_id to dlpack device

* doc: updated conversion docs

* doc: updated numpy.rst dlpack information

* doc: updated numpy.rst dlpack information

* Update docs/src/usage/numpy.rst

* Update docs/src/usage/numpy.rst

---------

Co-authored-by: Venkat Ramnan Kalyanakumar <venkatramnankalyanakumar@Venkats-MacBook-Air.local>
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-05-31 12:29:01 -07:00
Awni Hannun
fd1c08137b stable cumprod grad at 0 (#1167) 2024-05-31 12:28:42 -07:00
Jagrit Digani
76b6cece46 Fix multi-block sort stride management (#1169)
* Fix multi-block sort stride management

* Add seed to tests
2024-05-31 11:10:54 -07:00
Jagrit Digani
9f0df51f8d Fix matvec vector stride bug (#1168) 2024-05-29 12:18:28 -07:00
Awni Hannun
e7a2a3dcd1 Fix a couple bugs (#1161)
* fix jit reduce for RMS norm

* make strides a single buffer

* better eval error message

* fix compiling with inf and bf16

* fix cpu compile with bf16
2024-05-28 15:18:18 -07:00
Awni Hannun
a87ef5bfc1 fix broadcast bug in bitwise ops (#1157) 2024-05-24 11:44:40 -07:00
Awni Hannun
9f9cb7a2ef version bump (#1154) 2024-05-23 18:08:08 -07:00
Awni Hannun
7e26fd8032 Option to JIT steel gemm / conv (#1139) 2024-05-23 18:07:34 -07:00
Jagrit Digani
eab2685c67 Float mask update (#1152)
* Float mask update

* Update CPU impl
2024-05-23 17:20:44 -07:00
Angelos Katharopoulos
50dfb664db Comms (#1097)
* Start the communications branch using MPI
* Add ops and primitives
* Add python bindings for distributed
2024-05-23 17:04:02 -07:00
Awni Hannun
0189ab6ab6 More jitting (#1132)
* docs + circle min size build

* jit scan, arange, softmax

* add sort

* jit reductions

* remove print

* fix deps

* clean includes / nits
2024-05-23 16:23:44 -07:00
Rifur13
9401507336 Add groups to 2-D convolutions (#1129)
* Added groups to 2-D convolutions. Only implemented for **some** specializations.

Also fixed 1D grouped convs with different kernel strides and added more tests.

* fix channels condition
2024-05-22 20:01:44 -07:00
Awni Hannun
eb8321d863 list based indexing (#1150) 2024-05-22 15:52:05 -07:00
Abe Leininger
79ef49b2c2 add mx.trace (#1143) (#1147)
* working c++ trace implementation

* updated throw + added overloads

* added python binding for trace function

* pre-commit reformatting

* add trace to docs

* resolve comments

* remove to_stream call
2024-05-22 15:50:27 -07:00
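A one-liner for mx.trace from #1147 (sum of the main diagonal, NumPy semantics assumed):

    import mlx.core as mx

    x = mx.arange(9).reshape(3, 3)
    print(mx.trace(x))   # 0 + 4 + 8 = 12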
Awni Hannun
e110ca11e2 Fix offset bug for device buffers (#1151)
* fix bug with large offsets for buffers

* add a test

* remove test as its too big for small machine
2024-05-22 15:50:05 -07:00
Awni Hannun
226748b3e7 JIT compile option for binary minimization (#1091)
* try cpp 20 for compile

* unary, binary, ternary in jit

* nits

* fix gather/scatter

* fix rebase

* reorg compile

* add ternary to compile

* jit copy

* jit compile flag

* fix build

* use linked function for ternary

* some nits

* docs + circle min size build

* docs + circle min size build

* fix extension

* fix no cpu build

* improve includes
2024-05-22 12:57:13 -07:00
Awni Hannun
d568c7ee36 Rename block sparse (#1149)
* block_sparse_mm to gather_mm

* rename

* nit

* nit
2024-05-22 07:48:34 -07:00
Awni Hannun
e6fecbb3e1 Some fixes in docs (#1141)
* fixes in docs

* nit
2024-05-20 11:51:47 -07:00
Angelos Katharopoulos
da83f899bb Improve qvm speed (#1140) 2024-05-20 09:20:44 -07:00
jlwitthuhn
7e5674d8be Treate 'minimum' differently in cosine decay (#1138) 2024-05-20 08:00:48 -07:00
Shixian Sheng
0a558577bf Update README.md (#1136) 2024-05-20 06:16:40 -07:00
Awni Hannun
fb71a82ada Fix copy bug with many dims (#1137) 2024-05-17 21:10:03 -07:00
Awni Hannun
23406c9e9e Choose the right MLX bf16 for extensions (#1135)
* default to custom bf

* choose right bf

* fix extensions

* fix circle conf
2024-05-17 15:09:28 -07:00
Luca Arnaboldi
b3ec792380 Implemented Cholesky on CPU (#1119) 2024-05-17 12:31:59 -07:00
Awni Hannun
6a9b584f3d patch bump (#1131) 2024-05-16 20:51:33 -07:00
Awni Hannun
81dd33af66 allow conversion to dlpack (#1120) 2024-05-16 16:11:37 -07:00
Awni Hannun
8b76571896 Fix extensions (#1126)
* fix extensions

* title

* enable circle

* fix nanobind tag

* fix bug in doc

* try to fix config

* typo
2024-05-16 15:36:25 -07:00
Angelos Katharopoulos
e78a6518fa Block sparse qmm (#1124) 2024-05-16 15:24:14 -07:00
Awni Hannun
1873ffda01 Detect metal version and propagate correctly for JIT (#1109)
* detect metal version and propagate correctly for JIT

* remove softmax

* fix versions
2024-05-15 17:42:09 -07:00
Jacket
c417e42116 [Fix] minor typo in default argument for argpartition's "axis" parameter (#1125)
According to the documentation, argpartition's axis parameter can be None, but due to a previous typo it couldn't actually accept a None value.
2024-05-15 15:25:25 -07:00
Jagrit Digani
358e1fd6ab Fused GEMM (#1123)
* Basic gemm working

* Update addmm

* Clear out steel_gemm and steel_addmm kernels

* Fuse and clear out gather gemm

* Update objc releases
2024-05-15 10:30:41 -07:00
Awni Hannun
631dfbe673 fix scatter index bug (#1122) 2024-05-14 15:04:58 -07:00
Cheng
56a4eaed72 Pass missing stream arg in array.flatten (#1111) 2024-05-14 06:50:16 -07:00
Cheng
bf925d9dc7 Move args in conv_general (#1118)
Also fix a typo where padding_lo was passed as padding_hi.
2024-05-14 06:50:09 -07:00
Cheng
1a7ed5dcb6 Fill vector with constructor instead of fill_n (#1113) 2024-05-14 06:28:55 -07:00
Cheng
5be5daa6ef Use compiled function in Sigmoid module (#1116) 2024-05-14 06:25:57 -07:00
Cheng
60cb11764e Use correct module type in quantized.py (#1115) 2024-05-14 06:25:42 -07:00
Cheng
cbd5445ea7 The tile op does not accept None as reps (#1117) 2024-05-14 06:25:25 -07:00
Cheng
2c7e9b5158 Add missing docs for some ops (#1110) 2024-05-14 06:09:05 -07:00
Mike Drob
2263e4b279 Experiment with medium machines for CI (#1000) 2024-05-13 19:40:19 -07:00
Awni Hannun
863039da4c Allow scatter type exception to be caught by checking in op (#1077)
* allow exception to be caught in main thread

* only for gpu

* more detailed scatter error
2024-05-13 17:43:53 -07:00
Awni Hannun
7178ac0111 No CPU option for binary minimization (#1105)
* no cpu build option

* docs

* fix
2024-05-13 16:08:11 -07:00
Ravindra R. Jaju
e7f9710499 Fix typo in a variable name in example code. (#1104)
* Fix typo in a variable name in example code.

* Rename df2dx2 to d2fdx2 - the appropriate naming for the second derivative

* Update CONTRIBUTING.md - add needed python packages, and a virtual-env hint

* Revert "Fix typo in a variable name in example code."

This reverts commit bc10a17534.

* Rename df2dx2 to d2fdx2
2024-05-13 06:04:23 -07:00
Max-Heinrich Laves
ff4223904d Conv3d (#993)
* added conv3d

added conv3d

implemented explicit_gemm_conv_ND_cpu and bounds checks for slow_conv_3D

* incorporated reviewer comments

* fixed test

* reduced tensor shapes in test for conv3d

* Reviewer suggestion

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

Reviewer suggestion

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

Reviewer suggestion

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

Reviewer suggestion
2024-05-11 06:15:02 -07:00
Awni Hannun
a9f80d60f6 improve error messaging in eval (#1101) 2024-05-10 10:04:07 -07:00
Alex Barron
2e158cf6d0 Add conjugate operator (#1100)
* cpu and gpu impl

* add mx.conj and array.conj()

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
2024-05-10 07:22:20 -07:00
Awni Hannun
8bd6bfa4b5 version (#1099) 2024-05-09 17:52:39 -07:00
Awni Hannun
8b1906abd0 Add compiler flags to disable safetensors and gguf (#1098)
* with docs

* nit
2024-05-09 17:39:44 -07:00
Awni Hannun
06375e6605 Split encoders in non-concurrent context with a max ops per encoder (#1085)
* split encoders

* fix race
2024-05-09 16:21:02 -07:00
Awni Hannun
b21242faf1 Allow unary ops to accept array like (#1093) 2024-05-09 09:36:02 -07:00
Rahul Yedida
cc05a281c4 Added ArcTan2 operation (#1079)
* Added ArcTan2 operation

* Cleanup, bug fixes from code review

* Minor cleanup, fixed Linux tests
2024-05-08 08:35:15 -07:00
Jagrit Digani
fe96ceee66 Update block offset adjustment to be in size_t (#1087) 2024-05-08 08:10:23 -07:00
Awni Hannun
9814a2ae12 fix conversion to array (#1070) 2024-05-06 16:02:49 -07:00
Shubham
6992498e7a add keyword positional (#1081) 2024-05-06 07:18:49 -07:00
Awni Hannun
21623156a3 Reset peak memory (#1074)
* reset peak memory

* fix linux

* nits in docs
2024-05-03 17:12:51 -07:00
Nripesh Niketan
79c859e2e0 feat: implement clip_grad_norm (#1043)
* feat: implement `clip_grad_norm`

* pre-commit

* Add test for clip_grad_norm function in test_optimizers.py

* small fixes

* fix

* lint

* Update tree_reduce

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/utils.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Refactor clip_grad_norm function to include documentation and improve readability

* format docstring

* Add acknowlegements

* text wrap

* pre-commit

* nits in docs

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-05-03 09:07:02 -07:00
Awni Hannun
b00ac960b4 change initial memory limits and add memory size to device info (#1064) 2024-05-03 06:50:15 -07:00
Awni Hannun
02a9fc7bfa Patch bump (#1067)
* version

* use 0.12.2
2024-05-02 16:37:31 -07:00
Jagrit Digani
f390957685 Block sparse mm (#1058) 2024-05-02 14:03:58 -07:00
Angelos Katharopoulos
17f57df797 Improvements in the quantizer and dequantization kernel (#1061) 2024-05-01 18:19:11 -07:00
Awni Hannun
7f7b9662ea Fix leak for multi-output primitives which are never detached (#1059)
* fix multi output leak

* ignore arrays that will be detached

* add some comments

* stray print
2024-05-01 07:31:45 -07:00
Awni Hannun
19bef39f5c Add a mx.metal.device_info (#1060)
* device info

* add variant

* fix linux

* fix doc
2024-04-30 15:47:27 -07:00
Nripesh Niketan
a30e7ed2da feat: metal formatting and pre-commit bump (#1038)
* feat: metal formatting and pre-commit bump

* add guards

* update

* more guards

* more guards

* small fix

* Refactor instantiation of ternary types in ternary.metal

* fix scan.metal
2024-04-30 07:18:09 -07:00
Angelos Katharopoulos
8db7161c94 Bug fix in quantize (#1054) 2024-04-29 20:55:04 -07:00
Awni Hannun
09f1777896 fix slice update indexing (#1053) 2024-04-29 12:17:40 -07:00
Jacket
490c0c4fdc [Fix] expand axes for dimension with integer indices in mlx_slice_update (#1035)
* Not sure if this is correct

* Format

* Edit tests

* Add negative test

* Format

* add one more test

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-04-29 07:57:28 -07:00
Rifur13
c4a471c99d Add groups to Conv1d (#948)
* Add conv1d grouped convs on CPU

* Add GPU support

* Parallelize inside metal kernel

* cleanup

* Update mlx/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* New unfold kernel + remove unused code

* Remove copy and refactor

* Update vjp and reuse steel gemm

* Fixed groups on cpu

* Fix metal validation

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-04-27 06:24:57 -07:00
Awni Hannun
86f495985b Add bitwise ops (#1037)
* bitwise ops

* fix tests
2024-04-26 22:03:42 -07:00
Awni Hannun
67d1894759 fix order device -> scheduler (#1039) 2024-04-26 13:46:41 -07:00
Awni Hannun
5bfe89bdb1 Cpp docs (#1036)
* start of C++ docs

* fix stream doc

* only include ops for now
2024-04-26 12:56:05 -07:00
Angelos Katharopoulos
82463e9938 Bump the version to 0.12 (#1034) 2024-04-25 14:18:08 -07:00
Awni Hannun
771575d27b Expose function to clear memory cache (#1032)
* expose function to clear memory cache

* fix linux build

* fix metal tests
2024-04-24 16:48:51 -07:00
Angelos Katharopoulos
20a01bbd9f Simplifying and improving qmm (#1030) 2024-04-24 13:07:45 -07:00
Angelos Katharopoulos
ec8578d41a Fix quantization of all 0s (#1028) 2024-04-24 00:40:42 -07:00
Aneesh Shetty
d0dbfe0b97 Adds radians and degrees (#1011) 2024-04-22 11:17:49 -07:00
Awni Hannun
3d405fb3b1 Add synchronize function (#1006)
* add synchronize function

* fix linux

* fix linux

* fix and fix docs

* fix test

* try synchronize in stream destroy

* synchronize works for both cpu and gpu
2024-04-22 08:25:46 -07:00
Angelos Katharopoulos
b0012cdd0f Bump the patch version for the quants (#1018) 2024-04-19 20:28:34 -07:00
Angelos Katharopoulos
84d61d27aa Make sure 0 is represented in the quantization (#1016) 2024-04-19 19:47:26 -07:00
Awni Hannun
ed83908931 fix gguf loading quants (#1014)
* fix gguf loading quants

* fix nanobind install

* actual fix
2024-04-19 12:24:07 -07:00
Angelos Katharopoulos
ef5f7d1aea Fix buffer protocol buffer size designation (#1010) 2024-04-19 06:06:13 -07:00
Awni Hannun
090ff659dc bump (#1007) 2024-04-18 13:18:43 -07:00
Jagrit Digani
85c8a91a27 Fix mask broadcasting bug and add relevant test (#1003) 2024-04-17 17:33:48 -07:00
Piotr Rybiec
581b699ac9 avgpool, not maxpool (#1002) 2024-04-17 08:26:22 -07:00
Awni Hannun
8a0677d56d Shared events for synchronization + async eval (#998)
* more async eval

* fix rebase

* try correct async eval

* fix async

* more tests for async eval

* use shared events for synchronization

* comment + cleanup

* with autorelease pool

* fix no metal build

* fix compile

* fix patch

* don't eval if async eval'd

* don't use is_evaled

* comments

* more multi stream tests

* try and cleanup use of is_evaled

* use a status flag
2024-04-17 06:16:02 -07:00
Jagrit Digani
b18468bf81 Masked mm (#978)
* Add block masked matmul op and primitive
2024-04-16 14:45:39 -07:00
Shiyu
107ba2891a gelu tanh approx (#989)
* gelu tanh approx

* gelu tanh approx

* replace gelu approx with tanh approach

* fix comments

* fix comment
2024-04-15 19:49:00 -07:00
Awni Hannun
cd9e184529 Quantize embedding (#994)
* quantize embedding

* rename as_linear + comment

* consistency in docs

* fix test
2024-04-15 16:42:10 -07:00
Alex Barron
2e7c02d5cd Metal FFT for powers of 2 up to 2048 (#915)
* add Metal FFT for powers of 2

* skip GPU test on linux

* fix contiguity bug

* address comments

* Update mlx/backend/metal/fft.cpp

* Update mlx/backend/metal/fft.cpp

* fix bug in synch

---------

Co-authored-by: Alex Barron <abarron22@apple.com>
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-04-11 21:40:06 -07:00
Awni Hannun
ae18326533 No copy command encoder (#986)
* no copy command encoder

* up layer norm test tolerances
2024-04-11 21:15:36 -07:00
Alex Shepard
91eba8e485 fix for grammatical typo in docs (#988)
thanks for mlx!
2024-04-11 17:02:06 -07:00
Awni Hannun
d07e295c62 bumpity bump (#987) 2024-04-11 12:48:52 -07:00
Angelos Katharopoulos
dce4bd74a4 Add ArrayDesc destructor to avoid possible stack overflow (#982) 2024-04-11 11:37:02 -07:00
Nripesh Niketan
ffff671273 Update pre-commit hooks (#984) 2024-04-11 07:27:53 -07:00
Awni Hannun
12d4507ee3 Explicit barriers with concurrent dispatch (#977) 2024-04-10 21:45:31 -07:00
Awni Hannun
8580d997ff Try a stack-based DFS for eval (#980)
* rebase

* nit

* fix eval in vmap
2024-04-10 17:05:13 -07:00
Shiyu
061cf9a4ce Upsample with bicubic interpolation (#967) 2024-04-10 15:47:22 -07:00
Awni Hannun
99abb9eff4 Async eval (#972) 2024-04-09 18:34:00 -07:00
Luca Arnaboldi
fffe072028 Implementation of mlx.random.multivariate_normal (#502) (#877)
* Implementation of mlx.random.multivariate_normal (#502)

* Update python/src/random.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/random.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/random.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Updated typo in docstring

* Restricted multivariate_normal to  float32

* Generic mean and variance shapes

* Review edits

* Update mlx/random.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/random.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/random.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/random.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Test for ndim of mean and cov

* nits

* smaller size for test

* fix broadcasted sampling

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-04-09 13:50:12 -07:00
Abe Leininger
a1a31eed27 Add mx.meshgrid (#961) 2024-04-09 11:43:08 -07:00
Awni Hannun
ae812350f9 use string (#976) 2024-04-09 11:22:00 -07:00
Awni Hannun
b63ef10a7f Extensions (#962)
* start to fix extensions

* mostly fixed extensions

* fix extension build

* couple more nits
2024-04-09 08:50:36 -07:00
Awni Hannun
42afe27e12 std and expm1 (#973)
* std and expm1

* actually add expm1

* fix linux

* fix vjp

* relax tol for linux test

* Add it to the compilable primitives

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-04-08 14:26:01 -07:00
Awni Hannun
76e63212ff Enable bfloat scan (#974)
* enable bfloat scan
* fix tests
2024-04-08 12:29:19 -07:00
Awni Hannun
aac2f9fb61 Improve profiling with gpu tracing (#969)
* improve profiling with gpu tracing

* fix for linux

* nit

* doc fix

* fix example
2024-04-07 21:47:43 -07:00
Awni Hannun
bddf23f175 patch bump (#956) 2024-04-04 11:56:37 -07:00
Awni Hannun
039da779d1 No quant reshape (#957)
* precise option on cpu

* remove print

* remove reshape in quant matmul

* no quant reshape
2024-04-04 11:52:12 -07:00
Awni Hannun
d88d2124b5 segfault in layer norm grad (#955) 2024-04-04 10:59:15 -07:00
Awni Hannun
e142aaf8a1 Option for precise softmax (#953)
* precise softmax

* Add an equivalency check

* Make the threadgroup memory definition fixed

* precise cpu softmax

* precise option on cpu

* remove print

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-04-04 08:32:35 -07:00
AmirHossein_Razlighi
0caf35f4b8 Better exceptions in case of invalid operations on mlx.core.array (#910) (#926)
* Nicer exceptions for ops on non-arrays
2024-04-02 21:11:24 -07:00
Angelos Katharopoulos
3fc993f82d Properly handle negative axes in python vmap (#944) 2024-04-02 18:07:23 -07:00
Awni Hannun
741eb28443 fix a couple bugs (#952) 2024-04-02 12:07:41 -07:00
Angelos Katharopoulos
1a87dc5ea8 Fix compile fusion for multi-output edge cases (#950)
* Fix compile fusion for multi-output edge cases

* Add a test for multi-output compile
2024-04-02 08:42:31 -07:00
Awni Hannun
2427fa171e Fix cpu compile (#934)
* fix one cpu bug, test for another

* format hooks

* simplify contiguity check for cpu compile

* fix

* add back donation

* comment
2024-04-01 17:37:12 -07:00
Jagrit Digani
639e06e1f3 Indexing bug fix (#947)
* Fix axes accounting

* Add tests
2024-04-01 12:18:50 -07:00
Angelos Katharopoulos
02fedbf1da Fix array initialization from list (#942)
* Fix array initialization from list

* Change the error message in the test
2024-04-01 06:27:52 -07:00
Angelos Katharopoulos
110d9b149d Layer norm grad fix donation bug (#941)
* add layer norm grad test

* Fix donation bug in layernorm vjp

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-04-01 06:15:50 -07:00
Angelos Katharopoulos
9cbff5ec1d Fix typo in qmm check (#940) 2024-03-31 19:15:44 -07:00
Suvan Kumar
433c0206b0 Update saving_and_loading.rst (#929)
Update saving / loading docs.
2024-03-30 14:30:06 -07:00
Awni Hannun
8915901966 Donation bug (#933)
* donation

* buf

* fix bug in softmax

* comment

* remove print
2024-03-30 10:08:54 -07:00
AmirHossein_Razlighi
f48bc496c7 Comparing python objects (such as list/tuple) with mlx.core.array (#920)
* add implicit conversion of list to array for equality constraint

* add tests for array equality

* add test for tuple and array equality

* return False if __eq__ arg is list or tuple

* write tests for equality

* update the rule of comparison for __ge__/__gt__/__lt__/__le__

* add a helper function for detecting mlx.core.array

* return true in case of inequality

* debug minor issue regarding detecting mlx array

* add tests for inequality comparisons

* add name for contribution

* reformat files using pre-commit

* update tests for float

* update tests for inequality

* raise exception in case of invalid comparisons

* use isinstance instead of string comparison

* replace "is_convirtable_to_array" with previous logic

* remove throwing exceptions for other operations

* just a comment

* minor changes for efficiency

* optimize a utils function

* change the function name

* Update ACKNOWLEDGMENTS.md

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-03-29 06:52:30 -07:00
Cheng
913b19329c Add missing && when forwarding args (#925)
Without the &&, args would be copied and perfect forwarding would not work.
2024-03-29 06:48:29 -07:00
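A minimal sketch of the forwarding issue described in the commit above; the function names and the std::string sink are placeholders, not the actual MLX signatures. Without a forwarding reference the argument is always copied into the callee; with && and std::forward the value category is preserved and rvalues are moved through.

    #include <string>
    #include <utility>

    void consume(std::string s) { (void)s; }  // hypothetical sink that can take by move

    // Without &&: the parameter binds by const reference, so passing an
    // rvalue std::string still results in a copy inside consume().
    template <typename T>
    void call_copying(const T& arg) {
      consume(arg);  // always copies
    }

    // With a forwarding reference: std::forward preserves the value
    // category, so rvalues are moved all the way into consume().
    template <typename T>
    void call_forwarding(T&& arg) {
      consume(std::forward<T>(arg));
    }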
Awni Hannun
d8cb3128f6 bump (#924)
* bump

* fix version
2024-03-28 16:14:55 -07:00
Angelos Katharopoulos
5f9ba3019f Fix qmm_t for unaligned cases (#923) 2024-03-28 15:34:57 -07:00
Cheng
46caf0bef0 Remove unnecessary string copies (#891)
1. Use string_view instead of string when there is no need for a copy.
2. Otherwise move the string when possible.
2024-03-28 13:14:59 -07:00
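The two patterns from the commit above, sketched with generic names (not the actual MLX functions): a string_view parameter for read-only access, and a by-value parameter that is moved into place when the callee keeps the string.

    #include <string>
    #include <string_view>
    #include <utility>

    // Read-only access: string_view avoids constructing or copying a std::string.
    bool has_mlx_prefix(std::string_view name) {
      return name.substr(0, 4) == "mlx.";
    }

    // The callee keeps the string: take it by value and move it into the
    // member, so an rvalue argument is moved rather than copied.
    struct Named {
      std::string name;
      explicit Named(std::string n) : name(std::move(n)) {}
    };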
Jack Mousseau
45f636e759 Add Metal debug option and capture functions (#707)
* Add Metal debug option and capture functions

* Add brief Metal debugger documentation

* doc nits

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-03-28 09:40:31 -07:00
Cheng
a7b404ff53 Use uintptr_t instead of size_t to store function id (#916)
Also does some small cleanup of the compile cache code.
2024-03-28 06:37:59 -07:00
Angelos Katharopoulos
c4fd0e5ede Fixes #918 bug in compile_tests (#919) 2024-03-27 22:37:37 -07:00
Cheng
bab5386306 Make ops aware of rvalues: astype/as_strided/copy/full (#895)
When composing transforms, lots of temporary arrays are created and passed
to the next primitive; by making ops accept args by value we can avoid many
copies of these temporary arrays.
2024-03-27 22:35:55 -07:00
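A generic sketch of the by-value idea described above; Payload stands in for an array handle and none of this is the actual MLX API. The point is that a temporary produced by one op is moved into the next op instead of being copied.

    #include <utility>
    #include <vector>

    // Stand-in for an array-like handle whose copies we want to avoid.
    struct Payload {
      std::vector<float> data;
    };

    // Taking the argument by value lets callers move temporaries in; the
    // function can then hand the storage onward without another copy.
    Payload scale(Payload p, float factor) {
      for (auto& x : p.data) x *= factor;
      return p;  // moved out, not copied
    }

    // Chaining ops: each intermediate result is an rvalue, so it is moved
    // from call to call rather than copied.
    Payload pipeline(Payload p) {
      return scale(scale(std::move(p), 2.0f), 0.5f);
    }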
Angelos Katharopoulos
aca7584635 Fix OOB read in qmv when non-divisible by blocksize (#917) 2024-03-27 22:18:35 -07:00
AmirHossein_Razlighi
d611251502 Support Chaining for some of functionalities of nn.Module (#885) (#897)
* add chaining support for some of the functionalities of "nn.Module"

* reformat

* change the return types

* remove return types

* add return type with forward referencing

* add tests for chaining

* add name to contributors

* Update python/mlx/nn/layers/base.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/mlx/nn/layers/base.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* update docstring

* update docstrings

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-03-27 19:58:29 -07:00
Cheng
f30b659291 Make MLX build on x64 macOS (#901)
The arm64 MacBook Pros are heavy and I usually carry my Intel one when
mobile; it would be nice if I could play with MLX on it.

To build for x64, the user must pass `MLX_ENABLE_X64_MAC` to cmake:
CMAKE_ARGS='-DMLX_ENABLE_X64_MAC=ON' python setup.py
2024-03-27 06:14:29 -07:00
Cheng
90dfa43ff1 Don't use make_unique to create shared_ptr (#902)
The code compiled because shared_ptr's constructor actually accepts
unique_ptr.
2024-03-27 06:13:29 -07:00
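The point of the commit above as a tiny self-contained sketch: the unique_ptr-to-shared_ptr conversion does compile, but make_shared states the intent directly and allocates the object and its control block together.

    #include <memory>

    struct Node {
      int value = 0;
    };

    int main() {
      // Compiles because shared_ptr has a constructor taking unique_ptr&&,
      // but it performs a separate allocation for the control block.
      std::shared_ptr<Node> a = std::make_unique<Node>();

      // Clearer, and a single allocation for object + control block.
      auto b = std::make_shared<Node>();
      return 0;
    }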
Awni Hannun
dc175f08d3 Fix race in multi-stream eval (#911)
* maybe fix race

* comment
2024-03-26 16:36:36 -07:00
Angelos Katharopoulos
29221fa238 Implement vjps for some primitives in the fast namespace (#883)
* Implement rope vjp in terms of rope
* RMSNormVJP primitive and kernel
* Add LayerNormVJP primitive and kernel
2024-03-26 16:35:34 -07:00
Cheng
a789685c63 Remove duplicate defines of StreamOrDevice and is_big_endian (#892) 2024-03-26 15:15:11 -07:00
Jagrit Digani
240d10699c Implement negative padding in conv with slicing (#907)
* Implement negative padding with slicing

* Update mlx/ops.cpp

Co-authored-by: Awni Hannun <awni@apple.com>

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-03-26 14:59:19 -07:00
Jagrit Digani
925014b661 Fix multiblock sort limits (#906)
* Fix multiblock sort limits

* Fix metal validation error
2024-03-26 14:00:00 -07:00
Abdussamet Türker
5611e1a95e Fix unsqueeze with None (#899)
* Fix unsqueeze with None

* Clean unnecessary files
2024-03-26 13:59:44 -07:00
Awni Hannun
570f2bf29e pick up previously set attributes (#905) 2024-03-26 11:19:59 -07:00
Angelos Katharopoulos
9948eddf11 Fix nan and improve speed for qvm (#903) 2024-03-26 10:41:45 -07:00
Luca Arnaboldi
a3ee03da01 Fixing random.normal for half-precision dtype #642 (#904)
* Fixing random.normal for half-precision dtype #642

* Update python/tests/test_random.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-03-26 09:58:27 -07:00
Cheng
28fcd2b519 Add missing && when forwarding args (#894)
Without the &&, args would be copied and perfect forwarding would not work.

Also add template utils to make sure the function only forwards array
and not vector<array>.
2024-03-25 14:55:54 -07:00
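A sketch of the "template utils" idea mentioned above, with made-up names: an enable_if constraint keeps the forwarding overload from matching anything that does not decay to array, so a std::vector<array> falls through to a dedicated overload instead.

    #include <type_traits>
    #include <utility>
    #include <vector>

    struct array {};  // stand-in for the real array type

    // Only enabled when every argument decays to `array`, so a
    // std::vector<array> cannot silently pick this forwarding overload.
    template <
        typename... Args,
        typename = std::enable_if_t<
            (std::is_same_v<std::decay_t<Args>, array> && ...)>>
    void eval_arrays(Args&&... args) {
      // the real code would forward each array into the evaluator
      ((void)std::forward<Args>(args), ...);
    }

    // A separate, non-template overload handles the vector case explicitly.
    void eval_arrays(const std::vector<array>& outputs) {
      (void)outputs;
    }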
Jack Mousseau
8e686764ac Ensure shape dimensions are within supported integer range (#566) (#704)
* Ensure shape dimensions are within supported integer range (#566)

* fix build

* fix rebase bug

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-03-25 13:29:45 -07:00
Daniel Strobusch
479051ce1c add numeric type hierarchy and issubdtype as well as a set_dtype meth… (#427)
* add numeric type hierarchy and issubdtype as well as a set_dtype method to nn.Module with predicate

The numeric type hierarchy and issubdtype are compatible with the [numpy hierarchy](220f0ab2c5/numpy/_core/numerictypes.py (L42)).

Closes #285.

* nits in docs

* unify type category checking

* nits in docs

* nits in docs

* more docs nits

* fix callable type

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-03-25 12:32:59 -07:00
Awni Hannun
bfb5bad4f0 patch (#893) 2024-03-24 21:03:59 -07:00
Awni Hannun
1e16331d9c post nanobind docs fixes and some updates (#889)
* post nanobind docs fixes and some updates

* one more doc nit

* fix for stubs and latex
2024-03-24 15:03:27 -07:00
Awni Hannun
be98f4ab6b Reduce a little overhead (#871)
* some small overhead improvements

* use result_type in rms_norm

* remove release force

* fix + use non-vector version

* revert compile change

* fix ops

* a little more overhead

* a little more cleanup and overhead
2024-03-22 17:29:36 -07:00
Angelos Katharopoulos
6ee1112f30 Fix copy donation and add partial rope (#881) 2024-03-22 17:28:26 -07:00
Jagrit Digani
8e5a5a1ccd Set item bug fix (#879)
* set item shaping bug fix

* Add extra tests
2024-03-22 12:11:17 -07:00
Angelos Katharopoulos
fcda3a0e66 Increase test tolerance for fast.layer_norm (#880) 2024-03-22 12:10:27 -07:00
Cheng
9663c22fe9 Do not store iostream in shared_ptr (#872)
There is no need to store an iostream in a shared_ptr; doing so adds the
cost of a heap allocation.
2024-03-22 06:54:45 -07:00
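A sketch of the change described above with hypothetical types: file streams are movable, so the owning object can hold one directly instead of paying for a heap-allocated shared_ptr and its control block.

    #include <fstream>
    #include <memory>

    // Before: extra heap allocation for the stream plus a control block.
    struct ReaderBefore {
      std::shared_ptr<std::ifstream> file;
      explicit ReaderBefore(const char* path)
          : file(std::make_shared<std::ifstream>(path)) {}
    };

    // After: std::ifstream is movable, so it can simply be a member.
    struct ReaderAfter {
      std::ifstream file;
      explicit ReaderAfter(const char* path) : file(path) {}
    };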
Cheng
f0ae00da12 Reduce implicit copies in make_array (#874)
1. Move shapes into outputs instead of copying them.
2. Pass primitive by const ref as it is always copied into outputs, which
   removes a copy when calling make_array.
2024-03-22 06:29:16 -07:00
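A rough sketch of the two points above with simplified placeholder types, not the real MLX internals: the shape is moved into the output, and the primitive is taken by const reference because the only intended copy happens inside the factory.

    #include <memory>
    #include <utility>
    #include <vector>

    struct Primitive {};

    struct ArrayOut {
      std::vector<int> shape;
      std::shared_ptr<Primitive> primitive;
    };

    // The shape ends up owned by the output, so take it by value and move
    // it in; the primitive is always copied into the output anyway, so a
    // const reference avoids an extra shared_ptr copy at the call site.
    ArrayOut make_array(std::vector<int> shape,
                        const std::shared_ptr<Primitive>& primitive) {
      ArrayOut out;
      out.shape = std::move(shape);
      out.primitive = primitive;  // the one intended copy
      return out;
    }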
Awni Hannun
44390bd3d0 Bump (#869)
* bump

* fix none in a few ops
2024-03-21 13:56:56 -07:00
Angelos Katharopoulos
2225374060 Adds mx.fast.layer_norm (#870) 2024-03-21 13:55:51 -07:00
nicolov
105d236889 Add vmap for SVD and inverse (#849) 2024-03-21 13:18:27 -07:00
Angelos Katharopoulos
53e6a9367c Use reshape and transpose for non-overlapping pooling windows (#867) 2024-03-21 10:21:03 -07:00
Chime Ogbuji
f5a1582fe8 Add minimum for cosine decay function (#859)
* Add minimum for cosine decay function

* Update python/mlx/optimizers/schedulers.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-03-21 07:33:29 -07:00
Awni Hannun
a54f06b16f Fast RMS Norm (#862)
* fast rmsnorm

* no rms gpu

* kernel

* fix shared mem

* looped rms and donation in softmax

* Make the squaring in float32 to avoid underflow

* Fix the default StreamOrDevice for rope and rms_norm in fast

* nits

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-03-21 07:20:54 -07:00
Cheng
4650d94d98 Add missing && in eval (#864)
Without the &&, args would be copied and perfect forwarding would not work.

To avoid eval calling itself recursively, the vector version of eval is
changed to take its argument by value instead, which saves a copy of the
array when an rvalue is passed.
2024-03-21 06:15:48 -07:00
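A rough sketch of the shape of the two eval overloads implied above; the names and bodies are placeholders rather than the MLX implementation. The vector overload takes its argument by value, so an rvalue vector is moved in, while the variadic overload collects its arguments into a temporary vector and hands it over.

    #include <utility>
    #include <vector>

    struct array {};  // stand-in

    // By value: an rvalue vector is moved in, an lvalue is copied once.
    void eval(std::vector<array> outputs) {
      (void)outputs;  // evaluate the graph rooted at `outputs`
    }

    // Forwarding overload: collects the individual arrays into a temporary
    // vector and passes it (as an rvalue) to the overload above.
    template <typename... Args>
    void eval(Args&&... args) {
      eval(std::vector<array>{std::forward<Args>(args)...});
    }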
Jagrit Digani
a5681ebc52 Update set item (#861)
* Update mlx_set_item to handle regular slices without expanding

* Refactor ellipsis handling

* Route mlx_set_item to slice_update where possible

* Update mlx_scatter_args_slice

* Don't route to gather if no array indices
2024-03-21 02:48:13 -07:00
Cheng
e849b3424a Do not use static constexpr in header (#863)
Doing so results in each compilation unit (.cpp file) having its own
copy of the variable, while inline constexpr makes sure there is only
one copy.
2024-03-20 21:28:05 -07:00
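An illustration of the header-variable point above; the variable names are made up. A static constexpr variable at namespace scope has internal linkage, so every translation unit that includes the header gets its own copy, while a C++17 inline constexpr variable has a single definition shared by all of them.

    // in a header file
    #pragma once

    // Internal linkage: every .cpp that includes this header gets its
    // own copy of the variable.
    static constexpr int kMaxDimsPerTU = 32;

    // C++17 inline variable: one copy shared across translation units.
    inline constexpr int kMaxDims = 32;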
Jagrit Digani
b219d12a6b Check edge case handling in row reduce med kernel (#858) 2024-03-20 11:37:58 -07:00
Jagrit Digani
cec8661113 Add a SliceUpdate op and primitive (#850)
* Enable copy to work with int64 strides
* Fix uniform buffer indices or copy kernel arguments
* Update utils.h
* Remove manual unrolling of elem to loc loop
* GPU copy updated to handle negative strides
* Add slice update primitive
2024-03-20 10:39:25 -07:00
Cheng
73a8c090e0 Pass shape and inputs by value in array's constructor (#853)
Since the shape and inputs are always saved as copies in ArrayDesc, we can
unify array's constructors to just take the arguments by value.

There are 2 cases:
1. When shape is a lvalue, it will be copied into array's constructor and
   then moved into ArrayDesc's member. So only 1 copy happens.
2. When shape is a rvalue, it will be moved into array's constructor and
   then moved into ArrayDesc's member. So no copy happens.

So having one constructor that takes by value is equivalent to having two
constructors that take a const reference and an rvalue separately.
2024-03-20 07:54:30 -07:00
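The two cases listed above in miniature; Shape and Desc are placeholders for the real types.

    #include <utility>
    #include <vector>

    using Shape = std::vector<int>;

    struct Desc {
      Shape shape;
      // One by-value constructor covers both cases: an lvalue argument is
      // copied into `s` and then moved into the member (one copy total);
      // an rvalue argument is moved twice (no copy at all).
      explicit Desc(Shape s) : shape(std::move(s)) {}
    };

    // Shape s{2, 3};
    // Desc a(s);            // case 1: one copy
    // Desc b(Shape{4, 5});  // case 2: moves only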
Md. Rasel Mandol
db6796ac61 simple typo fille (#848) 2024-03-19 06:15:17 -07:00
Awni Hannun
9a8ee00246 Switch to nanobind (#839)
* mostly builds

* most tests pass

* fix circle build

* add back buffer protocol

* includes

* fix for py38

* limit to cpu device

* include

* fix stubs

* move signatures for docs

* stubgen + docs fix

* doc for compiled function, comments
2024-03-18 20:12:25 -07:00
Cheng
d39ed54f8e Some C++ code are not needed (#841)
1. Anonymous namespace means internal linkage, static keyword is not needed.
2. The default constructor of std::shared_ptr initializes the pointer to
   nullptr, you don't need to explicitly set it.
2024-03-18 17:04:10 -07:00
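Both points above in a short sketch with placeholder names.

    #include <memory>

    namespace {
    // Everything in an anonymous namespace already has internal linkage,
    // so a `static` keyword here would be redundant.
    int call_count = 0;
    }  // namespace

    struct Holder {
      // A default-constructed shared_ptr is already null; writing
      // `= nullptr` or setting it in a constructor adds nothing.
      std::shared_ptr<int> value;
    };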
Awni Hannun
16546c70d8 No reshape rope (#838)
* no reshape rope

* no reshape rope
2024-03-18 17:03:07 -07:00
nicolov
eaba55c9bf Add matrix inversion primitive (#822) 2024-03-15 06:34:36 -07:00
Awni Hannun
19ec023256 vmap matmul and admm (#836) 2024-03-14 14:38:22 -07:00
Awni Hannun
63ab0ab580 version (#835) 2024-03-14 12:20:40 -07:00
Jagrit Digani
8dfc376c00 Strided reduce specialization for small reductions (#826)
* Add small column / general reduction specialization
2024-03-14 09:16:53 -07:00
Angelos Katharopoulos
1efee9db09 Add types and order in kernel name (#831) 2024-03-13 20:34:06 -07:00
Awni Hannun
43abc402d8 route to fallback (#828) 2024-03-13 19:56:04 -07:00
Angelos Katharopoulos
3f8b1668c4 Make reshape faster for row_contiguous cases (#829) 2024-03-13 16:22:03 -07:00
Angelos Katharopoulos
76c919b4ec NumberOfElements for shapeless compile and vmap fixes (#802) 2024-03-13 10:34:14 -07:00
Angelos Katharopoulos
29d0c10ee5 Reshape improvement (#818) 2024-03-12 17:54:31 -07:00
Jagrit Digani
5ad133f8bb No copy gems (#801)
* Enable collapsing batch dims in gemm
* Update gemm to only make copies when neither of the last 2 axes are contiguous
* Update addmm to support gemv shapes
* Update addmm to support irregular batch strides
* Update tests
2024-03-12 13:13:41 -07:00
nicolov
d0c544a868 Add SVD primitive (#809)
Add SVD op using Accelerate's LAPACK following
https://developer.apple.com/documentation/accelerate/compressing_an_image_using_linear_algebra

Co-authored-by: Nicolo Valigi <nvaligi@apple.com>
2024-03-12 12:30:11 -07:00
Daniel Falbel
ffb19df3c0 Fix docstring for correctly rendering (#820) 2024-03-12 11:46:44 -07:00
Awni Hannun
8b7532b9ab fix scatter (#821) 2024-03-12 11:42:07 -07:00
Awni Hannun
366478c560 fix modules with dict (#819) 2024-03-12 08:54:06 -07:00
Justin Deschenaux
8e5600022a Implement RNN, GRU, LSTM (#268)
* RNN base implementation

* Address comments+format

* nits in docs

* add tests for prb

* fix test

* add a couple tests

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-03-11 21:14:44 -07:00
Awni Hannun
0e95b64942 Fix bug in tape order during simplify (#816)
* fix bug in tape order during simplify

* properly fix compile

* last bug
2024-03-11 17:29:05 -07:00
nicolov
0ae22b915b Remove code duplication in reduce ops (#793)
* Remove code duplication in reduce ops

* Remove the unnecessary lambda

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-03-11 10:57:07 -07:00
Awni Hannun
7c441600fe Compile stride bug (#812)
* fix compile stride bug

* revert sdpa fix

* fix cpu

* fix bug with simplifying outputs
2024-03-11 06:31:31 -07:00
Awni Hannun
a4d290adb9 Remove depth traversal (#813)
* no depth traversal

* counter outside loop
2024-03-09 20:21:32 -08:00
Awni Hannun
28301807c2 Version bump and os error (#807) 2024-03-07 13:57:58 -08:00
Awni Hannun
74ed0974b3 Support 13.0+ with xcode 14.3 (#806)
* Support 13.0+ with xcode 14.3

* revert revert
2024-03-07 13:27:57 -08:00
Jagrit Digani
ec8a4864fa Fix SDPA kernel bug on Mac OS 13.3 SDK (#805)
* Move sdpa kernel to allocate tgp mem statically and allow macOS 13.3 SDK builds

* Style
2024-03-07 10:18:09 -08:00
Awni Hannun
b7588fd5d7 fix inplace to not make a shallow copy (#804) 2024-03-07 09:34:11 -08:00
Awni Hannun
f512b905c7 Minimum xcode / sdk (#800)
* minimum xcode /sdk

* try multiple xcode versions in CI

* update python

* metal validation for python tests
2024-03-07 08:19:43 -08:00
Awni Hannun
afd5274049 route to fallback for bfloat (#794) 2024-03-06 15:39:12 -08:00
Awni Hannun
1074674e32 Add a maximum graph depth (#797)
* add a maximum graph depth

* remember how to use C++
2024-03-06 15:39:00 -08:00
AlexCheema
7762e07fde Update function_transforms.rst (#796)
Fix typo in function_transforms.rst
2024-03-06 12:03:37 -08:00
Luca Arnaboldi
cbefd9129e Implementation of pickle, copy and deepcopy for Python arrays (#300 & #367). (#713)
* Implemented pickling and copy for Python arrays(#300 & #367)

* Fixing typos

* Pickle with NumPy arrays

* Pickle: workaround for bfloat16

* Revert "Pickle: workaround for bfloat16"

This reverts commit 25afe6bc09.

* Added an error when pickling bfloat16

* Update python/tests/test_array.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/tests/test_array.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/array.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/array.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* clang-format applied

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-03-06 08:02:41 -08:00
Angelos Katharopoulos
e39bebe13e Fix reshaping of empty arrays (#791) 2024-03-05 23:33:22 -08:00
Angelos Katharopoulos
14b4e51a7c Improved quantized matrix vector product (#786) 2024-03-05 17:32:19 -08:00
Awni Hannun
cbcf44a4ca Some fixes in cache / thread safety (#777)
* some fixes in cache / thread safety

* speed up no cache case

* fix opt test

* optimizer docs

* optimizer docs

* fix adafactor

* fix adafactor
2024-03-05 13:30:50 -08:00
Awni Hannun
859ae15a54 Fix test (#785) 2024-03-04 23:02:27 -08:00
Brian Keene
0787724c44 Fast Inference SDPA op (#735)
* Fast Inference SDPA op

Implements metal shaders for:

o = mx.fast_inference_sdpa(queries, keys, values, scale, mask)

Supports fp16, fp32 dtypes; assumes d_k = 128.

Generic op support / prompt encoding supported via mlx primitives.
Metal implementation is for the inference use case only.

The majority of the performance benefit appears to result from GQA & reduced
bandwidth requirements; there is approximate performance parity for the
MHA use case (from some measurements on M3 Max).

* Flush shared memory to zero before unprotected reads for (scores @ values)

* Move to fast:: namespace, address reviewer comments

... also attempt to revert formatter auto-change for files not relevant
to this change

* Shared memory flush to top of kernel

* Resolve compiler warnings

* Update python/src/fast.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/fast.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/fast.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/fast.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update docstring per PR feedback

* Softmax in higher precision, ...

* route to fallback for more use cases - batch size > 1, head_dim other
  than 128, etc.
* Address linux build failure
* Address other reviewer comments

* Remove extraneous eval_cpu function per review

---------

Co-authored-by: Atila Orhon <64497909+atiorh@users.noreply.github.com>
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: atila <atiorh@icloud.com>
2024-03-04 21:06:11 -08:00
Awni Hannun
7b463ffb07 Ios compile (#784)
* try to fix build for ios

* skip cpu compile

* fix namespace

* fix namespace

* Use CMake for platform specific cpu compile

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-03-04 20:02:26 -08:00
Jagrit Digani
6686e61ca4 Reduce update (#783)
* Split reduction files to reduce compile times

* Add small and medium axis size specializations for row reductions

* Add non-row-reduction options for small and med kernels
2024-03-04 19:09:51 -08:00
Awni Hannun
c096a77b9b revision bump (#778) 2024-03-04 13:41:53 -08:00
Awni Hannun
5121f028d9 nice tensordot for mlx c (#782) 2024-03-04 09:51:02 -08:00
Piotr Rybiec
6a665ea6ed Dilation for convolutional layers (#766)
* add dilation parameter to Conv1d layer

* space here too

* add conv1d dilation test

* add dilation parameter for Conv2d layer

* conv2d dilation test
2024-03-04 06:43:00 -08:00
Awni Hannun
bc06cb9ff6 Pickle + dtype fix for numpy conversion (#763)
* pickle + dtype fix for numpy conversion

* fix getattribute on Module base

* remove unused function

* fix tests

* add topk to ops

* fix doc
2024-03-02 06:09:29 -08:00
Angelos Katharopoulos
8e281c76c3 Fix the top-k op (#768) 2024-03-01 22:08:43 -08:00
Awni Hannun
d5964a2710 bindings for memory info (#761)
* bindings for memory info

* update api

* keep cache low if requested

* fix default

* nit in ops error
2024-03-01 19:51:58 -08:00
Ikko Eltociear Ashimine
cf3eb87e52 Fix typo in transforms.cpp (#764)
occuring -> occurring
2024-02-29 22:23:46 -08:00
Awni Hannun
ab3a466711 bump (#760) 2024-02-29 11:58:54 -08:00
Awni Hannun
4494970f47 avoid nested closures in module (#759) 2024-02-29 09:39:52 -08:00
Jagrit Digani
776c3d226d Convolution update (#651)
* Init steel conv and update Conv primitive

* Update slow CPU implementation to support flipping and input dilation winograd conv routing

Co-authored-by: Awni Hannun <awni@apple.com>
2024-02-28 20:11:16 -08:00
Awni Hannun
f5f18b704f fix temporary bug (#752) 2024-02-27 17:44:39 -08:00
Awni Hannun
420ff2f331 Add back compiled function signatures and docstrings (#749)
* try to add back compiled function signatures and docstrings

* add indentation to docstring
2024-02-27 13:18:59 -08:00
Awni Hannun
56ba3ec40e fix cpu compile on older OS (#747) 2024-02-26 22:20:53 -08:00
Noah Kasmanoff
de3d2467a3 Update: Fast GeLU Approximation (#744)
* add: fast gelu approx

* fix docs

* Update gelu_fast_approx function documentation

* Update python/mlx/nn/layers/activations.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* fix: test gelu

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-02-26 21:08:50 -08:00
Awni Hannun
fe1dabf272 Fix compile with non standard types (#745)
* refactor tree utils

* fix compile + tree code refactor

* Add an extra test

* add a few missing activations to docs

* hash structure

* Encode the full argument structure

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-02-26 19:28:53 -08:00
Hinrik Snær Guðmundsson
08226ab491 added atleast *args input support (#710)
* added atleast list(array) input support

* function overloading implemented

* Refactoring

* fixed formatting

* removed pos_only
2024-02-26 11:17:59 -08:00
Chime Ogbuji
3b661b7394 Add linear warmup and schedule joining for use with existing schedules (#721)
* Add linear warmup to schedules for use with existing schedules

* Changed parameters for simplicity of most common case (0 initial value)

* Added ScheduleJoiner and updated documentation

* ScheduleJoiner -> join_schedules (ala optax #)

* black compliance

* Different evaluation of schedules

* nits

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-02-26 07:28:48 -08:00
Awni Hannun
e6418781ab Fix logsumexp edge case (#740)
* fix logsumexp

* fix inf constant

* also fix power grad

* fix ternary dispatch
2024-02-25 08:39:55 -08:00
Awni Hannun
ac02cf33bd Fix some issues using MLX in C++ (#739)
* fix preamble build

* fix some issues with using MLX as a dep in C++
2024-02-24 22:20:57 -08:00
Gabrijel Boduljak
22364c40b7 Upsample2d (#414)
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-02-23 09:55:04 -08:00
Noah Farr
d729a1991b Fix arange with inf step (#686)
* Fix case for step=inf in arange and add inf check for start/stop

* Add test cases for arange

* Update ops.cpp to include climits header

* Fix arange

* Fix formatting

* Refactor

* Add missing include
2024-02-23 06:18:15 -08:00
Rifur13
126c9869c8 Implement the 'where' primitive for conditional selection (#664) 2024-02-22 15:10:48 -08:00
Angelos Katharopoulos
ad4a45e615 Fix the release builds in CI (#729) 2024-02-22 14:09:13 -08:00
Awni Hannun
04fc896016 version bump (#727) 2024-02-22 11:54:17 -08:00
Jagrit Digani
884b4ed43b Fix threadgroup memory in arg reduce (#723) 2024-02-21 19:42:16 -08:00
Vijay Krish
972d9a3aea Up to 10x faster scatter. (#709)
* Faster scatter.

Add specialization for 1-d index tensors.

* Address review comments.

- Check for row contiguity of index, update tensors
  instead of checking strides.
- Add support for 1d specialization with col contiguous update
  tensor, along with a test.

* Nit1

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Nit2

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-02-21 11:09:30 -08:00
Angelos Katharopoulos
7dcdd88e27 Change the logo and add a dark option (#716) 2024-02-20 10:57:02 -08:00
Awni Hannun
8120a3b65c link to other APIs (#715)
* link to other APIs

* remove sec
2024-02-20 09:54:49 -08:00
Awni Hannun
5798256fcf Shapeless compilation for some graphs (#687)
* shapeless compilation for some graphs

* update compile benchmark

* default compile a few activations

* buffer donation

* bugfix

* shapeless fix

* update tests to work for cpu and gpu fusion

* test kwargs

* add kwargs to compile

* Recompile when python arguments change

* no compile for tanh

* some constant tests

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-02-19 21:43:54 -08:00
Awni Hannun
d0fda82595 fix tolist for half types (#702) 2024-02-19 09:44:27 -08:00
Hinrik Snær Guðmundsson
f883fcede0 Added support for atleast_1d, atleast_2d, atleast_3d (#694) 2024-02-19 09:40:52 -08:00
Diogo
e1bdf6a8d9 discover doctests in cmake (#703) 2024-02-19 07:03:56 -08:00
Awni Hannun
1a4f4c5ea6 Refactor CPU compile preamble (#708)
* refactor cpu preamble

* fix include order

* fix some issues

* fixes for linux

* try to fix includes

* add back warning suppression

* more linux fixes
2024-02-19 06:12:53 -08:00
Jack Mousseau
0925af43b0 Remove unused variables (#706) 2024-02-18 12:50:10 -08:00
Awni Hannun
dc937b8ed3 CPU compile (#691)
* build and load shared object for cpu compile

* nits

* cpu compile tests pass

* cpu compile tests pass

* fix preamble for g++

* donation

* fix gpu buffer donation

* reuse prebuilt libraries

* faster contiguity conditions

* fix test

* rid compiler warning

* fast erf

* Fix float16 for compile and add more types to cpu compile

* Remove a forgotten comment

* use cached libs

* nits

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-02-17 06:54:32 -08:00
Awni Hannun
c3965fc5ee Separate fast ops and primitives (#699) 2024-02-16 19:16:39 -08:00
Awni Hannun
bf7cd29970 version bump (#698) 2024-02-16 08:44:08 -08:00
Nripesh Niketan
a000d2288c feat: update black pre-commit hook to 24.2.0 (#696) 2024-02-16 06:01:59 -08:00
Mike Drob
165abf0e4c Auto-run PRs from contributors (#692) 2024-02-15 17:30:35 -08:00
Srimukh Sripada
818cda16bc Support LR schedulers (#334)
* Add a few LR schedulers

* Move parents's constructor call to the top

* Fix docstring

* refactor optimizers into two files

* add docs

* nit

* Fix Callable type annotation for python 3.8

---------

Co-authored-by: Awni Hannun <awni@apple.com>
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-02-15 11:26:20 -08:00
toji
85143fecdd improved error msg for invalid axis(mx.split) (#685)
* improved error msg for invalid axis(`mx.split`)

* Apply suggestions from code review

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* fixed formatting issue

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-02-15 07:25:38 -08:00
Diogo
35431a4ac8 Adds device context manager (#679) 2024-02-14 14:14:58 -08:00
Awni Hannun
ccf1645995 Custom primitive + RoPE fat op (#676)
* extensions start

* rope custom op

* fix build

* docs + rope benchmark

* fix test

* Add a Metal kernel for RoPE

* Fix position of traditional

* transform tests

* Move rope computation to float and fix tests

* Fix the test and a typo

* change to fast

* fix no metal build

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-02-14 14:04:25 -08:00
Jagrit Digani
1a48713d32 Update gather and scatter to not use Argument Encoder (#683)
* Replace argument encoder usage for gather and scatter

* Use constant address space for shapes and strides

* Split gather and scatter to improve compile times

* Enable the GPU tests

* Update the CI config

* Fix scatter dispatch for scalar indices

* Remove arg encoder utils

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-02-14 13:42:13 -08:00
Awni Hannun
1eb04aa23f Fix empty array construction in cpp (#684) 2024-02-13 23:34:17 -08:00
Noah Farr
0c65517e91 Return empty array when repeats is 0 in mx.repeat (#681)
* Return empty array when repeats is 0

* Add test case for repeats = 0
2024-02-13 17:49:31 -08:00
Vijay Krish
2fdc2462c3 Faster gather and scatter. (#682)
Reduce unnecessary integer ops, especially since
these kernels are integer bound.

Increase number of iterations for benchmarks for
better smoothing.

Github Issue #506

Co-authored-by: Vijay Krishnamoorthy <vijay_krish@apple.com>
2024-02-13 17:47:41 -08:00
Hinrik Snær Guðmundsson
be6e9d6a9f Fixed wording in extensions.rst (#678)
changed "learn how add" -> "learn how to add"
2024-02-13 08:39:02 -08:00
Gabrijel Boduljak
e54cbb7ba6 Pooling layers (#357)
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-02-12 22:08:13 -08:00
Angelos Katharopoulos
40c108766b Quantized matmul fix (#677)
* Fix qmv for small or unaligned matrices

* Fix qmm
2024-02-12 18:54:21 -08:00
Mike Drob
4cc70290f7 PR Builder Workflow (#659) 2024-02-12 17:47:21 -08:00
Awni Hannun
74caa68d02 nit in readme (#675) 2024-02-12 12:25:04 -08:00
Awni Hannun
3756381358 Faster bfloat quantized mat-vec and vec-mat (#663) 2024-02-11 21:53:16 -08:00
Awni Hannun
d12573daa6 quote file name (#670) 2024-02-11 10:33:30 -08:00
Nripesh Niketan
0dbc4c7547 feat: Update pre-commit-config.yaml (#667) 2024-02-11 06:08:20 -08:00
Vijay Krish
06072601ce Scatter optimization : Eliminate 64b integer divide. (#662)
Launch 2D grid to eliminate divide and mod in device code,
since 64b integer division is very expensive.

Github Issue #506

Co-authored-by: Vijay Krishnamoorthy <vijay_krish@apple.com>
2024-02-10 08:49:51 -08:00
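A schematic of the indexing change described above, written as plain host-side C++ pseudocode; the actual change lives in the Metal dispatch and kernels. With a 1D launch each thread recovers its 2D position with a 64-bit divide and modulo, while a 2D launch gets both coordinates directly from the grid.

    #include <cstdint>

    // 1D launch: each thread derives (row, col) with a 64-bit divide and
    // modulo, which is expensive in device code.
    void index_from_1d(uint64_t tid, uint64_t n_cols,
                       uint64_t& row, uint64_t& col) {
      row = tid / n_cols;
      col = tid % n_cols;
    }

    // 2D launch: the dispatcher supplies row and column as the two grid
    // coordinates, so no divide or modulo runs per thread.
    void index_from_2d(uint64_t grid_x, uint64_t grid_y,
                       uint64_t& row, uint64_t& col) {
      row = grid_y;
      col = grid_x;
    }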
Angelos Katharopoulos
11d2c8f7a1 Linux build for CI of other packages (#660) 2024-02-09 18:17:04 -08:00
Awni Hannun
7f3f8d8f8d Fix the softmax fix (#661) 2024-02-09 17:02:13 -08:00
Awni Hannun
b96be943dc bug fix (#658) 2024-02-09 16:50:45 -08:00
Abdussamet Türker
b670485185 Remainder negative numerator bug fixed (#641)
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-02-09 16:49:14 -08:00
Diogo
b57bd0488d Metadata support for safetensors (#639)
* metadata support for safetensors

* aliases making it a little more readable

* addressing comments

* python binding tests
2024-02-08 19:33:15 -08:00
Angelos Katharopoulos
221f8d3fc2 Bump the version to 0.2 (#656) 2024-02-08 11:27:12 -08:00
Awni Hannun
5c03efaf29 Compile docs (#653)
* compile docs

* docs nits + comments
2024-02-08 11:21:50 -08:00
LeonEricsson
7dccd42133 updated calls to use loc &scale (#643) 2024-02-08 09:01:59 -08:00
Awni Hannun
1b97b2958b Compile with capture (#629)
* Simple kernel generation

* Remove the generate kernel from graph_utils

* fix multi-output with compile

* fuse with stopgrad

* v1 input, output capture in compile

* cleanup tree update with visitor update

* nit

* remove todo

* state for model, optional explicit init and more pure optimizer steps

* move learning rate to state

* add lr to opt state, some fixes in capture

* fix optim

* update tuple of containers as well

* fix stream for compiled output

* rng state for compile

* nit

* updates and comments

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-02-07 17:29:22 -08:00
Awni Hannun
e5e816a5ef fix sequential with empty modules at end (#647) 2024-02-07 13:22:27 -08:00
Angelos Katharopoulos
28eac18571 Kernel generation (#614)
Generate reusable element-wise kernels given a computation graph.
2024-02-07 13:15:59 -08:00
Noah Farr
5fd11c347d Add loc and scale to random.normal (#638)
* Add loc and scale to random.normal

* Add tests for loc and scale for random.normal

* Run pre-commit hooks

* Fix code review
2024-02-07 11:49:59 -08:00
Aryan Gupta
ef73393a19 Feat: Add weights argument in BCE Loss and tests (#620) 2024-02-07 09:39:52 -08:00
Angelos Katharopoulos
ea406d5e33 CI change (#645)
* CI update

* Skip large binary test for now

* Upgrade pip

* Add proper env variable skipping

* Update the CI

* Fix workflow name

* Set the low memory flag for the tests

* Change build process

* Add pip upgrade

* Use a venv

* Add a missing env activate

* Add setuptools

* Add twine upload back

* Re-enable automatic release builds
2024-02-07 06:04:34 -08:00
Awni Hannun
146bd69470 Skip compile when transforming (#635)
* skip compile when transforming

* simplify message
2024-02-05 21:28:37 -08:00
Jagrit Digani
316ff490b3 Remove masks from BlockLoader and clear out load case for invalid thread (#634) 2024-02-05 16:00:17 -08:00
Awni Hannun
d40a04f8dc minor fixes (#631)
* minor fixes

* var with ddof >= nelements
2024-02-05 13:27:49 -08:00
Awni Hannun
d75ae52ecd Compile primitive (#571)
* Compiled primitive with basic binary, unary graph-level fusion
2024-02-05 06:51:22 -08:00
Avikant Srivastava
31fea3758e feat: enhancement of the error message for mlx.core.mean (#608)
* add error message
2024-02-05 01:21:49 -08:00
Awni Hannun
e319383ef9 Faster gather (#626)
* faster gather

* update copyright
2024-02-04 17:25:44 -08:00
Awni Hannun
5c3ac52dd7 fix test (#627) 2024-02-04 16:18:03 -08:00
David Koski
ebfd3618b0 fixes for building and running on iOS (#619)
* fixes for building and running on iOS

* per suggestion just use Accelerate
2024-02-04 12:29:17 -08:00
Avikant Srivastava
11a9fd40f0 fix: handle linspace function when num is 1 (#602)
* fix: handle linspace function when num is 1

* add comment

* fix test case

* remove breakpoint
2024-02-04 11:03:49 -08:00
Daniel Strobusch
4fd2fb84a6 make python array SupportsAbs conform (like numpy) (#624) 2024-02-04 09:31:02 -08:00
Daniel Strobusch
9852af1a19 fix "shape" docstring. (#623) 2024-02-04 09:21:22 -08:00
minghuaw
16750f3c51 Fix typo in CMakeLists.txt (#616) 2024-02-03 05:59:26 -08:00
Awni Hannun
95b5fb8245 minor changes (#613) 2024-02-02 11:48:35 -08:00
AtomicVar
83f63f2184 Add Margin Ranking Loss (#536) 2024-02-02 10:57:31 -08:00
Awni Hannun
cb6156d35d Fix eval in trace bugs (#612)
* Fix eval in trace bugs

* comment nit
2024-02-02 09:57:12 -08:00
Piotr Rybiec
506d43035c typo fix (#607) 2024-02-01 17:39:55 -08:00
Angelos Katharopoulos
36cff34701 Bump the version (#604) 2024-02-01 11:41:38 -08:00
Awni Hannun
e88e474fd1 Reduce vmap + some fixes (#601) 2024-02-01 11:30:28 -08:00
David Koski
601c6d6aa8 Fix for AdaDelta (#603)
- state was being read from parameter "s"
- but being stored in parameter "u"
2024-02-01 09:56:27 -08:00
Angelos Katharopoulos
ba8d6bf365 Change the transformer to norm_first by default (#599) 2024-01-31 12:55:30 -08:00
Sugato Ray
4a5f3b21bb Add py.typed to support PEP-561 (type-hinting) for mlx (#588)
* Add `py.typed` to support PEP-561 (type-hinting)

This adds support for type-hinting information as laid out in [PEP-561](https://peps.python.org/pep-0561/).

* add py.typed to MANIFEST.in
2024-01-31 12:05:42 -08:00
Vijay Krish
fcc5ac1c64 Add GPU support for uint64/int64 reductions (#569) 2024-01-31 11:18:04 -08:00
nathan
bad67fec37 Added TeX line breaks to mlx.optimizers.Lion docstring (#595)
Fixes the "misplaced &" MathJax error in documentation.
2024-01-30 19:37:34 -08:00
Angelos Katharopoulos
199aebcf77 Change the variance computation (#319) 2024-01-30 19:28:56 -08:00
Angelos Katharopoulos
0de5988f92 Custom VJP and checkpointing (#541)
* Implement custom_vjp and checkpointing
* Add a dependency management primitive
* Change the eval order to deep branches first
* Add graph depth tracking to the array
2024-01-30 16:04:45 -08:00
Jacket
143e2690d5 Fix SGD implementation (#473) 2024-01-30 15:50:46 -08:00
Jagrit Digani
375446453e Update Compute Pipeline Creation API (#581)
* Add option to specialize metal functions on function constants
* Update Compute Pipeline Creation API
* Add options to make libraries from source and stitching
* Update function specialization name options
2024-01-30 15:42:36 -08:00
Angelos Katharopoulos
1895d34c20 Fix log1p with inf inputs (#592) 2024-01-30 14:02:50 -08:00
Awni Hannun
09b9275027 Make shape a tuple (#591)
* shape tuple

* also remove simplify from docs

* rebase
2024-01-30 13:11:01 -08:00
Andre Slavescu
d3a9005454 Softshrink mapping + op (#552)
* Added Softshrink mapping + op

* formatting

* docs + nits in docstring

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-30 12:56:28 -08:00
Jacket
3f7aba8498 Implement diagonal operator (#562)
* Implement diagonal operator

This implements mx.diagonal at the operator level, inspired by
@ManishAradwad.

* added `mx.diag` with tests

* corrected few things

* nits in bindings

* updates to diag

---------

Co-authored-by: ManishAradwad <manisharadwad@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-30 09:45:48 -08:00
Angelos Katharopoulos
65d0b8df9f Fix binary op dispatch (#584) 2024-01-29 19:36:17 -08:00
Awni Hannun
3c2f192345 Propagate nans in binary ops (#579)
* propagate nans in binary ops

* handle empty matmul

* cpu minimum/maximum propagate nan

* benchmark maximum

* add min as well

* throw on negative indices with full

* verbose on linux

* fix matmul for zero K
2024-01-29 11:19:38 -08:00
Angelos Katharopoulos
37d98ba6ff No gil eval (#565) 2024-01-26 22:03:52 -08:00
Awni Hannun
8993382aaa Buffer Donation (#519)
* buffer donation

* fix to move shared pointer

* format

* gpu in place for copy and binary

* revert ops test

* cpu in place

* a little cleanup

* remove useless bench
2024-01-26 16:30:33 -08:00
Awni Hannun
07f35c9d8a Fix a few issues: docs for flatten, erf, dequantize validation (#560)
* doc flatten

* erf doc

* check values for dequantize

* format
2024-01-26 15:16:46 -08:00
Jagrit Digani
bf17ab5002 Add more checks and clearer error messages to conv operations (#563)
* Add more checks and clearer error messages to conv operations
2024-01-26 15:13:26 -08:00
Awni Hannun
8fa6b322b9 Compile front-end (#476)
* fix tests for linux

* make a move on compile

* basic compile scaffold works

* compile binding

* clean

* fix

* fix grad, more tests

* basic python tests

* fix segfault on python exit

* compile works with python closures

* fix test

* fix python globals bug, and erase

* simplify

* more cpp tests

* bug fix with move function and compile at exit

* simplify inputs also

* enable and disable compiler

* remove simplify

* simplify tests use compile now

* fix multi-output with compile

* clear output tree from cache when function goes out of scope

* ../python/src/transforms.cpp

* remove closure capture

* comments
2024-01-26 13:45:30 -08:00
David Koski
874b739f3c Fix cache key in RoPE (#561) 2024-01-26 13:10:02 -08:00
taher
077c1ee64a QR factorization (#310)
* add qr factorization

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-26 09:27:31 -08:00
Rifur13
2463496471 [Fix] mx.allclose bug with infinite values (#539)
* Added isclose op and fixed comparison with inf values

* Added 'equal_nan' to match numpy

* format

* Add test

* Update python/src/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update python/src/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Addressed CR comments

* Update python/src/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* nits

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-25 20:47:06 -08:00
Angelos Katharopoulos
87b7fa9ba2 Bump the version (#554) 2024-01-25 11:01:05 -08:00
Danilo Peixoto
624065c074 Fix package installation for CI (#521)
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-01-25 09:43:34 -08:00
Awni Hannun
f27ec5e097 More helpful error message in vjp transform + concatenate bug (#543)
* more helpful message in vjp transform

* fix concatenate on mismatch dims

* typo

* typo
2024-01-24 09:58:33 -08:00
Awni Hannun
f30e63353a Minor updates to address a few issues (#537)
* docs on arg indices return type

* arange with nan

* undo isort
2024-01-23 22:24:41 -08:00
Juarez Bochi
4fe2fa2a64 GGUF: Avoid dequantization when format is compatible (#426)
* GGUF: Don't dequantize q4_1

* Fix weight order. First in low bits

* Add unpacking for q4_0

* Don't dequantize q8_0

* rebase quants and split file

* don't quantize every weight

* reapply patch

* error handling

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 15:43:57 -08:00
Hazem Essam
37fc9db82c Added Adafactor (#415)
* Added adafactor

* Added Adafactor and ran pre-commit

* modified operations

* Added docstrings

* Switched two ops to fix a bug

* added underscore for internal functions and removed the plus sign in the last return statement

* Removed parameter rms from the optimizer state because it's not needed

* Added simple MNIST test for Adafactor and temporary training log

* remove test files

* nits in docs

* comment nit

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 15:11:27 -08:00
AtomicVar
755dcf6137 Enable cross_entropy loss to handle dense targets (#517)
* Enable cross_entropy loss to handle dense targets

Dense targets means probabilities or one-hot encodings.

* better shape check of weights

* nits in docstring

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 12:17:22 -08:00
LeonEricsson
6b4b30e3fc Common neural network initializers nn.initializers (#456)
* initial commit: constant, normal, uniform

* identity, glorot and he initializers

* docstrings

* rm file

* nits

* nits

* nits

* testing suite

* docs

* nits in docs

* more docs

* remove unused template

* rename package to nn.init

* docs, receptive field

* more docs

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-23 06:47:20 -08:00
Awni Hannun
86e0c79467 remove stale benchmarks (#527) 2024-01-22 22:17:58 -08:00
Awni Hannun
98c37d3a22 use axes in tensordot (#525) 2024-01-22 21:17:00 -08:00
Sugato Ray
f326dd8334 Update README.md (#524)
Add conda install option in docs.
2024-01-22 20:53:54 -08:00
Jagrit Digani
6d3bee3364 Fix oob reads in gemv kernel (#523) 2024-01-22 12:06:04 -08:00
Danilo Peixoto
ecb174ca9d Type annotations for mlx.core module (#512) 2024-01-21 12:53:12 -08:00
Awni Hannun
7a34e46677 Quantize with groups of 32 (#511)
* allow quantize with group sizes of 32

* missing cpu dispatch

* remove print

* Fix qvm for group_size 32

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-21 06:19:05 -08:00
Nripesh Niketan
92c22c1ea3 feat: Update isort version to 5.13.2 (#514) 2024-01-21 06:11:48 -08:00
Awni Hannun
d52383367a format (#510) 2024-01-20 10:33:46 -08:00
Arda Orçun
363d3add6d Add ValueError message for Adamax (#508)
* ValueError message added

* beta errors added

* some corrections and testing

* Learning rate limitation deleted
2024-01-20 07:56:15 -08:00
Awni Hannun
b207c2c86b Power VJP fix for 0 (#505) 2024-01-20 01:17:40 -08:00
Awni Hannun
6bf779e72b fix array from list for > 32 bit types (#501) 2024-01-19 15:49:25 -08:00
Juarez Bochi
ddf50113c5 GGUF: Load and save metadata (#446)
* gguf metadata
---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-19 14:06:05 -08:00
Arda Orçun
6589c869d6 Added MSE message (#500)
* Added MSE message

* changed wrong line.

* Update examples/python/linear_regression.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-01-19 06:27:50 -08:00
Anchen
f6feb61f92 feat: add support for saving safetensors in the save_weights (#497)
* feat: add save safetensors support in module save_weights

* chore: checking missing changes

* Update python/mlx/nn/layers/base.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* chore: update docstring for load_weights

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2024-01-19 06:19:33 -08:00
Awni Hannun
c4ec836523 fix isinf for integer types (#494) 2024-01-19 05:31:10 -08:00
AtomicVar
550d4bf7c0 Update binary_cross_entropy function to handle both logits and probabilities (#492) 2024-01-18 19:22:23 -08:00
Awni Hannun
f6e911ced0 version bump (#490)
* version bump

* Fix the dev version string

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-18 12:00:24 -08:00
Awni Hannun
3d99a8d31d Fix format / build (#489) 2024-01-18 10:01:59 -08:00
Ethan
a749a91c75 Support disabling the metal buffer cache to prevent performance degradation caused by large memory caching (#390)
* support disabling the metal buffer cache, due to large unused memory being buffered when an LLM generates long context tokens

* Run format and add "cache_enabled" feature tests
2024-01-18 08:33:34 -08:00
toji
49a52610b7 Added formatter structure and a boolean value formatter (#354)
* added formatter structure and a boolean value formatter

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-18 07:49:41 -08:00
AtomicVar
d1fef34138 Add Gaussian NLL loss function (#477)
* Add Gaussian NLL loss function

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-18 06:44:44 -08:00
Angelos Katharopoulos
9c111f176d Fix split optimization for array iterator (#484) 2024-01-18 05:50:25 -08:00
Awni Hannun
78e5f2d17d usage doc for function transformations (#481) 2024-01-17 17:10:53 -08:00
Angelos Katharopoulos
90c234b7ac Fix round to round half-cases to even (#482) 2024-01-17 15:27:23 -08:00
Angelos Katharopoulos
135fd796d2 Fix detach for multi-output primitives (#480) 2024-01-17 14:08:07 -08:00
Jagrit Digani
78102a47ad Update GEMM (#424)
* Organize and collect metal subroutine templates and elements in `metal/kernels/steel/`
* Update gemm elements for better performance 
* Add split-K specialization for gemm
* Add `addmm` primitive, op and bindings for fused matmul and bias addition 
* Update tests and benchmarks as needed
2024-01-17 12:42:39 -08:00
Diogo
556cdf0e06 Resolves build issues with the extension example (#419)
* resolved extension build issues and added test to ci

* missing gguflib

* rebased

* force mlx install from fix branch

* linux build issue

* point to git install and comment out ci tests
2024-01-17 12:07:05 -08:00
Awni Hannun
275db7221a Command buffer reports errors (#479)
* command buffer reports errors

* typo

* simplify
2024-01-17 11:53:30 -08:00
AtomicVar
4a9012cba0 Sort some APIs docs by names (a-z) (#472) 2024-01-16 19:37:50 -08:00
Awni Hannun
a2bf7693dd Primitive's VJP takes outputs as input (#475)
Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-16 19:03:53 -08:00
Angelos Katharopoulos
d8fabaa12b Split multi output (#461)
* Multi-output split primitive
* Add the multi-output split to the ArrayIterator
* Add some grad tests for split
2024-01-16 13:33:55 -08:00
Avikant Srivastava
4e290d282f feat: add time based seed to random.h (#457)
* random seed from time

* fix: chrono

* refactor: snake case
2024-01-16 07:32:28 -08:00
Yashraj Singh
e72458a3fa implemented isposinf and isneginf in one PR (#470)
* ran precommit

* updated docs
2024-01-16 06:48:07 -08:00
Awni Hannun
a2ffea683a Fix eye for larger matrices (#463)
* fix eye
* fix scatter for <32bit (non native atomic) types
* fix int overflow
2024-01-16 00:51:24 -08:00
Angelos Katharopoulos
c15fe3e61b Allow arbitrary first dimension in quantization kernels. (#458)
* Allow arbitrary first dim on qmm_t and qmv
* Allow arbitrary first dim on qmm and qvm
* Specialized aligned vs unaligned case
* Add more checks for valid quantizations
2024-01-16 00:46:21 -08:00
Tristan Bilot
f44c132f4a Add scatter_min VJP (#462) 2024-01-16 00:37:40 -08:00
Matthew Ernst
92a2fdd577 Adds isinf (#445)
* adds isinf

Signed-off-by: matthewfernst <matthew.f.ernst@gmail.com>

* use stream + nits

* typo

---------

Signed-off-by: matthewfernst <matthew.f.ernst@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-15 19:50:44 -08:00
Tristan Bilot
6022d4129e scatter_max vjp + bindings + tests (#431)
Co-authored-by: DjamelMesbah <djamel.mesbah@adservio.fr>
2024-01-14 14:12:15 -08:00
Awni Hannun
4bc446be08 Use a dummy primitive to only sync with one output (#453)
* Use a dummy primitive to only sync with one output
* Fix test and choose stream with slight care
2024-01-14 14:09:40 -08:00
Awni Hannun
41cc7bdfdb Fix stub generation, change graph exporting for arrows to go to outputs (#455) 2024-01-14 14:06:16 -08:00
Awni Hannun
6e81c3e164 Sync only with outputs we need to sync with (#447) 2024-01-13 01:47:25 -08:00
Diogo
2e29d0815b Add tile op (#438) 2024-01-12 23:03:16 -08:00
Awni Hannun
1b71487e1f docs (#444) 2024-01-12 13:34:16 -08:00
Ayush Shridhar
1416e7b664 Add isnan (#423) 2024-01-12 11:16:48 -08:00
davidkoski
29081204d1 array.swapaxes should point to swapaxes free function (#441) 2024-01-12 11:06:16 -08:00
Angelos Katharopoulos
006d01ba42 Fix packaging of gguflib (#435) 2024-01-11 13:56:03 -08:00
Awni Hannun
46dc24d835 version bump (#433) 2024-01-11 12:29:35 -08:00
Awni Hannun
c9934fe8a4 Metal validation (#432)
* tests clear metal validation

* add cpp test with metal validation to circleci

* nit
2024-01-11 11:57:24 -08:00
Avikant Srivastava
975e265f74 feat: Add numpy constants (#428)
* add numpy constants

* feat: add unittests

* add newaxis

* add test for newaxis transformation

* refactor
2024-01-11 06:47:29 -08:00
Awni Hannun
c92a134b0d more docs (#421)
* more docs

* fix link

* nits + comments
2024-01-10 14:04:12 -08:00
Awni Hannun
3b4f066dac Correct types for vjp + tests (#418)
* correct types for vjp + tests

* fix build + comment
2024-01-10 13:32:37 -08:00
Juarez Bochi
b7f905787e GGUF support (#350)
* Initial GGUF support for tensor fields.

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-10 13:22:48 -08:00
Chunyang Wen
e3e933c6bc Add type hint for Module (#412) 2024-01-10 11:23:42 -08:00
Awni Hannun
1d90a76d63 in place ops behave in place, fix some overloads (#411) 2024-01-09 16:05:38 -08:00
Angelos Katharopoulos
961435a243 Scatter vjp (#394)
* Add a first scatter vjp
* Implement the scatter_add vjp
* Add array.at to implement user friendly scatters
2024-01-09 13:36:51 -08:00
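A sketch of the user-friendly scatter syntax mentioned above (semantics assumed from the mlx array.at documentation: regular indexed updates write duplicates only once, while .at accumulates them):

```
import mlx.core as mx

idx = mx.array([0, 0, 2])

# Regular indexed update: duplicate indices write only once
b = mx.zeros((4,))
b[idx] = b[idx] + 1           # [1, 0, 1, 0]

# a.at[...] scatters with accumulation over duplicate indices
a = mx.zeros((4,))
c = a.at[idx].add(1)          # [2, 0, 1, 0]
```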
Awni Hannun
e9ca65c939 Fix BN stats to not expand shape (#409)
* fix BN stats to not expand shape

* nit
2024-01-09 11:54:51 -08:00
Dwayne Robinson
753867123d Fix data_types.rst uint64 (#406)
uint64 correctly says 8 bytes, but the description is copy pasta.
2024-01-09 06:40:10 -08:00
Awni Hannun
f099ebe535 Multi output primitives (#330)
* Multi-output primitives

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2024-01-08 16:39:08 -08:00
BigsnarfDude
f45f70f133 Update mlx-example link for llms llama in llama-inference.rst (#405) 2024-01-08 16:29:53 -08:00
YUN, Junwoo
0b8aeddac6 Additional losses (#336)
* cosine similarity loss

---------

Co-authored-by: Awni Hannun <awni@apple.com>

* Docstring nits
2024-01-08 14:01:13 -08:00
Jagrit Digani
432ee5650b Update cpp tests with allclose and doctest::Approx for numerical tolerance (#401) 2024-01-08 09:35:05 -08:00
Nripesh Niketan
73321b8097 feat: add logicalAnd and logicalOR (#386)
* feat: add logicalAnd and logicalOR

* run pre-commit

* Refactor logical_and and logical_or functions

* Add acknowledgement

* Add logical AND and logical OR operators

* Refactor logical_and and logical_or functions

* Add support for logical operators on bool arrays

* Update mlx/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Update mlx/ops.cpp

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Add logical AND and OR operators for arrays and scalars

* Refactor vjp and jvp methods in primitives.cpp

* Add overloaded operators for logical AND and OR

* format

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-08 07:00:05 -08:00
Hazem Essam
022a944367 Added GLU activation function and Gated activation function (#329)
* Added GLU activation function and gated activation function

* Ran pre-commit

* Ran pre commit

* Removed old sigmoid implementation to match with main

* Removed gated activation from __init__.py

* Removed unused test cases

* Removed unused imports

* format / docstring

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-08 06:13:16 -08:00
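A minimal sketch of what the gated linear unit computes (mx.split and mx.sigmoid are existing ops; nn.GLU is the module this change adds, and the equivalence shown is an assumption about its definition):

```
import mlx.core as mx
import mlx.nn as nn

x = mx.random.normal((2, 8))

# GLU splits an axis in half and gates one half with the other
a, b = mx.split(x, 2, axis=-1)
manual = a * mx.sigmoid(b)

glu = nn.GLU(axis=-1)
out = glu(x)  # expected to match `manual`
```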
Chris Costes
026ef9aae4 Update Install Instructions (#397)
* Add note to install instructions for building from source to ensure native arm64 environment and tools.

* Add troubleshooting info.

* remove cmake bits

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-07 19:11:04 -08:00
Angelos Katharopoulos
a611b0bc82 Removes the retain_graph flag (#385)
* Adds global tracing flag
* Removes retain_graph in favor of is_tracer
2024-01-07 15:16:51 -08:00
Diogo
449b43762e Add inner / outer op (#348)
* inner / outer impl

* python tests

* ops list and ack

* updated descriptions

* use test helper

* removed dtype check and flatten outer to 1-D

* updated docs

* just use the reshape to flatten
2024-01-07 09:01:09 -08:00
Angelos Katharopoulos
6ea6b4258d Fix style check (#395) 2024-01-07 05:54:58 -08:00
Anchen
48f6ca8c3a Add theta cache for Rope and mask cache for ALiBi (#375) 2024-01-07 00:22:58 -08:00
Awni Hannun
c6d2878c1a safely divide for 0 size inputs (#388) 2024-01-07 00:19:54 -08:00
Awni Hannun
b34bf5d52b fix saving for non-contiguous arrays (#389) 2024-01-06 12:44:02 -08:00
Angelos Katharopoulos
608bd43604 Move the matmul type check in the op (#384) 2024-01-05 19:10:13 -08:00
Angelos Katharopoulos
4c48f6460d Fix segfault from buffer protocol and tests (#383)
* Fix segfault from buffer protocol and tests

* Fix tf test
2024-01-05 18:17:44 -08:00
Daniel Strobusch
1331fa19f6 Make array conform to the Python Buffer Protocol (#323) 2024-01-05 15:58:33 -08:00
Daniel Strobusch
dfdb284e16 make behaviour of dtype arguments consistent and compliant to numpy (#379)
All functions that take an optional dtype should

* have a default dtype visible in the generated docs (accomplished via `"dtype"_a = std::optional{float32}`)
* behave identical when `dtype=None` or no dtype is passed

This is important when passing keyword args down from a NumPy-style function like:

```
import mlx.core as mx

def f(x, dtype=None):
    # Forwarding dtype unchanged must behave the same as not passing it at all
    return mx.random.uniform(shape=x.shape, dtype=dtype)
```

NumPy functions behave like this.

It also fixes a minor bug in `tri`: #378

Closes #378
2024-01-05 09:37:46 -08:00
mutexuan
d8f41a5c0f support python mlx.array creation from list of mlx.array's (#325)
* support python mlx.array creation from list of mlx.array's

* include bfloat16 in UT

* refactor so that sub array made of all python primitive types gets initialized by fill_vector

* address PR comment: arr.shape().size() -> arr.ndim()

* address PR comment: get back Dtype constness and let stack to handle type promotions automatically
2024-01-04 18:53:33 -08:00
Awni Hannun
b9e415d19c bump pre commit and fix format (#373) 2024-01-04 16:28:52 -08:00
davidkoski
c82a8cc526 move all ObjC (via metal-cpp) interaction until post static initializers (#370)
* move all ObjC (via metal-cpp) interaction until post static initializers

- metal-cpp relies on static initializers to cache class and selector pointers
- code in mlx was using metal-cpp to set up NSAutoreleasePools during its own static init time
- but this code was silently failing as the class and selector pointers from metal-cpp were still nil

- defer the creation of NSAutoreleasePools until after static init time
- ensure that we have coverage where autorelease pools are needed

* Update device.cpp

remove commented code

* Update device.cpp

remove commented out code

* Update scheduler.h

update comment

* per discussion use the pool inside the task() -- this will be metal only, not needed for cpu

* Update allocator.cpp

move pool to release/alloc area
2024-01-04 16:12:00 -08:00
Angelos Katharopoulos
75dc537e44 Fix the sigmoid module (#371) 2024-01-04 13:16:36 -08:00
Awni Hannun
cf88db44b5 revert copy (#366) 2024-01-04 10:43:29 -08:00
Chunyang Wen
16856a0160 Remove useless pass (#364)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2024-01-04 06:34:01 -08:00
Awni Hannun
d752f8e142 Fix CI (#359)
* fix ci

* check for linux for fp16
2024-01-04 06:33:08 -08:00
toji
d2467c320d Added support for python copy (#335)
* Added support for python copy

* precommit changes

* removed `_compiled_call_impl` line

* added tests and suggested changes

* ACK changes
2024-01-03 20:59:40 -08:00
Diogo
0d31128a44 use union instead of | (#358) 2024-01-03 19:33:19 -08:00
Diogo
1ac18eac20 simple numpy helper for tests (#352) 2024-01-03 19:19:19 -08:00
Awni Hannun
526466dd09 version bump (#355)
* version bump

* one more
2024-01-03 14:48:24 -08:00
Angelos Katharopoulos
e7f5059fe4 Support for quantized matmul with w and w^T (#349)
* Add the metal qvm implementation
* Add qmm_n
* Add gradient wrt to input for quantized_matmul
2024-01-03 14:22:36 -08:00
Nripesh Niketan
d7ac050f4b feat: Add contributors graph to README (#332)
* Fix: typo in README.md

* feat: Add contributors graph to README

* Update acknowledgments and contributors
2024-01-03 13:03:11 -08:00
Gabrijel Boduljak
c7edafb729 implemented InstanceNorm (#244)
* implemented instancenorm

* implemented vector_norm in cpp

added linalg to mlx

* implemented vector_norm python binding

* renamed vector_norm to norm, implemented norm without provided ord

* completed the implementation of the norm

* added tests

* removed unused import in linalg.cpp

* updated python bindings

* added some tests for python bindings

* handling inf, -inf as numpy does, more extensive tests of compatibility with numpy

* added better docs and examples

* refactored mlx.linalg.norm bindings

* reused existing util for implementation of linalg.norm

* more tests

* fixed a bug with no ord and axis provided

* removed unused imports

* some style and API consistency updates to linalg norm

* remove unused includes

* fix python tests

* fixed a bug with frobenius norm of a complex-valued matrix

* complex for vector too

* addressed PR review comments

* fixed import order in __init__

* expected values in instancenorm tests are simple lists

* minor return expression style change

* added InstanceNorm to docs

* doc string nits

* added myself to individual contributors

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-03 12:21:15 -08:00
Awni Hannun
dff4a3833f Module checks the weight on load_weights (#337)
* update module to check weights on load, also fix docs and reorganize tests

* nits + rebase

* a few more docs updates for Module

* use manual module file

* comment
2024-01-02 18:55:42 -08:00
Diogo
0782a4573a Add Tensordot op (#344) 2024-01-02 17:15:00 -08:00
Diogo
af66a09bde Adds issue template with common questions (#345)
* added template

* remove label
2024-01-02 16:52:20 -08:00
Angelos Katharopoulos
436bec9fd9 Fix the implementation of the Bilinear layer (#347) 2024-01-02 16:46:18 -08:00
Awni Hannun
99c80a2c8b Memory allocation (#292)
* try alternative gc

* try no cache

* add forced swap

* remove cache for now

* add cache back

* change fit criteria

* remove unused function

* nit in comment

* tune / fix allocation

* increase block limit to original
2024-01-02 11:59:19 -08:00
Asaf Zorea
295ce9db09 Feature expand nn linear (#315)
* Added identity and bilinear layers
Added a reset_parameters option
Added normal init for bias

* pre-commit run

* add type hints for parameters and the return type
change Bilinear math to x_1 and x_2
change __call__ arguments to x and y instead of input and output
add explanation to the Initialization

* Remove unnecessary reshape

* Added 'i' to bilinear formula

* Changed bilinear computation to two matrix multiplications

* avoid saving intermediate results, kept y in bilinear for better clarity (can be replaced with x1)

* Changed math formula in Linear
Added more explanation to math formulas
Changed x1, x2 reshape to support all input sizes
2024-01-02 06:08:53 -08:00
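A brief usage sketch for the expanded Bilinear layer (constructor argument order assumed to be first-input dims, second-input dims, output dims):

```
import mlx.core as mx
import mlx.nn as nn

x = mx.random.normal((8, 3))   # first input, 3 features
y = mx.random.normal((8, 4))   # second input, 4 features

layer = nn.Bilinear(3, 4, 5)
out = layer(x, y)              # shape (8, 5)

# Per output feature k: out[n, k] = x[n] @ W[k] @ y[n] + b[k],
# computed internally as two matmuls rather than an explicit loop.
```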
Josh Soref
44c1ce5e6a Spelling (#342)
* spelling: accumulates

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: across

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: additional

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: against

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: among

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: array

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: at least

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: available

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: axes

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: basically

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: bfloat

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: bounds

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: broadcast

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: buffer

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: class

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: coefficients

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: collision

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: combinations

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: committing

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: computation

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: consider

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: constructing

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: conversions

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: correctly

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: corresponding

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: declaration

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: default

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: dependency

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: destination

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: destructor

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: dimensions

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: divided

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: element-wise

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: elements

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: endianness

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: equivalent

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: explicitly

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: github

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: indices

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: irregularly

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: memory

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: metallib

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: negative

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: notable

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: optional

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: otherwise

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: overridden

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: partially

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: partition

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: perform

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: perturbations

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: positively

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: primitive

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: repeat

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: repeats

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: respect

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: respectively

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: result

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: rounding

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: separate

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: skipping

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: structure

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: the

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: transpose

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: unnecessary

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: unneeded

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: unsupported

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

---------

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>
2024-01-01 21:08:17 -08:00
Chunyang Wen
144ecff849 Remove useless import (#340)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2024-01-01 19:25:49 -08:00
mutexuan
350095ce6e fix type cast error in item() for bfloat16 (#339)
Co-authored-by: xuan <xuan@apple.com>
2024-01-01 19:02:04 -08:00
Nripesh Niketan
e09bf35b28 feat: Add Dropout3d layer to nn.layers (#313)
* feat: Add Dropout3d layer to nn.layers

* acknowledgement

* Add dropout tests to test_nn.py

* run pre-commit

* Add activation functions and dropout3d ops

* Add dropout tests for bfloat16 and float16
2023-12-31 14:01:21 -08:00
Daniel Strobusch
99c20f523e fix typos (#327) 2023-12-31 06:06:47 -08:00
Hazem Essam
e3b8da2a49 Added implementation for Scaled RoPE. (#261)
* Added scale for RoPE

* Ran pre-commit

* Added RoPE scaling test

* Added docstring for scale parameter

* Modified docstrings
2023-12-31 06:06:01 -08:00
Angelos Katharopoulos
a020a2d49d Improve repeat using broadcasting and reshape (#318) 2023-12-29 21:40:20 -08:00
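A sketch of the broadcasting-and-reshape idea behind this change (not the actual implementation; mx.repeat is shown only for comparison):

```
import mlx.core as mx

def repeat_via_broadcast(x, n, axis):
    # Insert a new axis after `axis`, broadcast it to size n, then fold it in.
    x = mx.expand_dims(x, axis + 1)
    shape = list(x.shape)
    shape[axis + 1] = n
    x = mx.broadcast_to(x, shape)
    shape.pop(axis + 1)
    shape[axis] *= n
    return x.reshape(shape)

a = mx.array([[1, 2], [3, 4]])
print(repeat_via_broadcast(a, 2, axis=0))   # [[1, 2], [1, 2], [3, 4], [3, 4]]
print(mx.repeat(a, 2, axis=0))              # built-in op, same result
```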
Nripesh Niketan
930b159885 Fix: typo in README.md (#316) 2023-12-29 12:58:00 -08:00
Nripesh Niketan
5ad8fb7268 feat: add softsign, softmax, hardswish, logsoftmax activation function (#309)
* feat: add softsign activation function

* run pre-commit

* Add Softsign activation function

* Add Softsign activation function

* Add documentation for ReLU6, Softplus, and Softsign activations

* Update activation functions in neural network layers

* Add LogSoftmax and Hardswish activations

* run pre-commit

* Update activations.py

* Added acknowledgements

* Fix activation function comments

* Fix activation functions in neural network layers
2023-12-29 11:49:36 -08:00
Chunyang Wen
2aedf3e791 Minor refactor for tree_map and tree_unflatten (#311)
* Minor refactor for tree_map and tree_unflatten

* Remove the if statement

---------

Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2023-12-28 20:55:10 -08:00
Chunyang Wen
473b6b43b4 Use defaultdict (#307)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2023-12-28 14:46:13 -08:00
Angelos Katharopoulos
d29770eeaa Update batchnorm to have the running stats in parameters (#305) 2023-12-28 14:31:10 -08:00
Chunyang Wen
040c3bafab Add missing f str (#306)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2023-12-28 06:09:34 -08:00
Chunyang Wen
05767b026f Add information for dropout probability (#304)
Co-authored-by: Chunyang Wen <chunyang_wen@apple.com>
2023-12-27 21:51:30 -08:00
Diogo
a83d5d60bd Addition in acknowledgements (#302) 2023-12-27 13:46:47 -08:00
Bahaa
ff2b58e299 Add support for repeat (#278)
* add repeat function

* fix styling

* optimizing repeat

* fixed minor issues

* not sure why that folder is there xD

* fixed now for sure

* test repeat not repeat test

* Fixed

---------

Co-authored-by: Bahaa Eddin tabbakha <bahaa@Bahaas-MacBook-Pro.local>
2023-12-27 13:11:38 -08:00
YUN, Junwoo
4417e37ede Transformer fix (#167)
* add transformer with dropout, fix transformer ffm, layernorm order

* precommit changes

* precommit changes

* add docstring, activation, norm_first

* run precommit

* run precommit

* add doctstring

* precommit

* style nits in docs

---------

Co-authored-by: junwoo-yun <junwoo.yun@bagelcode.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-27 08:48:36 -08:00
Angelos Katharopoulos
79c95b6919 Fix load compilation (#298) 2023-12-27 06:20:45 -08:00
Diogo
1f6ab6a556 Safetensor support (#215)
Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-27 02:06:55 -08:00
Gabrijel Boduljak
6b0d30bb85 linalg.norm (#187)
* implemented vector_norm in cpp

added linalg to mlx

* implemented vector_norm python binding

* renamed vector_norm to norm, implemented norm without provided ord

* completed the implementation of the norm

* added tests

* removed unused import in linalg.cpp

* updated python bindings

* added some tests for python bindings

* handling inf, -inf as numpy does, more extensive tests of compatibility with numpy

* added better docs and examples

* refactored mlx.linalg.norm bindings

* reused existing util for implementation of linalg.norm

* more tests

* fixed a bug with no ord and axis provided

* removed unused imports

* some style and API consistency updates to linalg norm

* remove unused includes

* fix python tests

* fixed a bug with frobenius norm of a complex-valued matrix

* complex for vector too

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-26 19:42:04 -08:00
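A few usage examples for the new linalg.norm binding (default orders assumed to follow NumPy: 2-norm for vectors, Frobenius for matrices):

```
import mlx.core as mx

v = mx.array([3.0, 4.0])
m = mx.array([[1.0, 2.0], [3.0, 4.0]])

mx.linalg.norm(v)                  # 5.0, vector 2-norm
mx.linalg.norm(m)                  # Frobenius norm when no ord is given
mx.linalg.norm(m, ord=1, axis=0)   # column-wise 1-norms
```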
Angelos Katharopoulos
447bc089b9 Fix tolerance in de-/quantization test (#295) 2023-12-26 19:21:05 -08:00
Yutaka Kondo
fc4e5b476b Fix llama link in README.md (#289) 2023-12-25 20:53:20 -08:00
Daniel Strobusch
d58ac083f3 expose itemsize and nbytes as for numpy arrays (#284)
see:
  * https://numpy.org/doc/stable/reference/generated/numpy.ndarray.nbytes.html
  * https://numpy.org/doc/stable/reference/generated/numpy.ndarray.itemsize.html

relates to https://github.com/ml-explore/mlx-examples/pull/174
2023-12-25 10:34:28 -08:00
__mo_san__
a123c3c7d2 implement-batch-norm-layer (#217)
- Add batch normalization layer

---------

Co-authored-by: Robert McCraith <mccraithrobert@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-25 07:32:53 -08:00
Angelos Katharopoulos
9e6b8c9f48 Refactor the reduction kernels (#277) 2023-12-24 14:47:57 -08:00
Zach Schillaci
22fee5a383 Remove redundant assert in losses.py (#281) 2023-12-24 08:39:08 -08:00
Daniel Strobusch
7365d142a3 random.uniform must respect dtype, even if lower precision than "low" (#280)
Fix an edge case where random.uniform returned a float32 array even when a lower-precision dtype was requested, because adding the float32 "low" array promoted the result.
2023-12-24 07:04:43 -08:00
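A sketch of the fixed behavior (a minimal example; the promotion rule before the fix is as described in the message above):

```
import mlx.core as mx

# The float32 `low`/`high` scalars no longer promote the result;
# the requested dtype wins.
x = mx.random.uniform(low=0.0, high=1.0, shape=(4,), dtype=mx.float16)
print(x.dtype)  # mlx.core.float16
```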
Awni Hannun
8b227fa9af fix no metal build (#276) 2023-12-23 19:18:10 -08:00
Vidit Agarwal
8c3da54c7d Fix failing test for log cosh loss (#275)
* fix assert statement in log_cosh_loss

* reformatted by pre-commit black
2023-12-23 16:26:46 -08:00
Vidit Agarwal
acf1721b98 Corrected the example of value_and_grad (#274)
* Corrected the example for mx.value_and_grad

* Reformat through pre-commit/black
2023-12-23 11:06:38 -08:00
Finn Voorhees
f91f450141 Fix argmax returns documentation (#263) 2023-12-22 20:33:17 -08:00
Ronan Collobert
cd3616a463 Revisit autorelease memory pools (#260)
* make general autorelease pool part of metal device

* make things simpler

* no metal backend support

* new_memory_pool -> new_scoped_memory_pool
2023-12-22 11:01:26 -08:00
Nicholas Santavas
d35fa1db41 Add Hinge, Huber and LogCosh losses (#199) 2023-12-22 10:28:10 -08:00
Justin Deschenaux
e8deca84e0 Add dropout2d (#250) 2023-12-22 08:02:29 -08:00
Angelos Katharopoulos
8385f93cea Bumping the version (#256) 2023-12-21 18:33:14 -08:00
Awni Hannun
2118c3dbfa fix (#255) 2023-12-21 18:18:41 -08:00
Awni Hannun
a002797d52 A temporary fix (#254) 2023-12-21 17:59:15 -08:00
Angelos Katharopoulos
1d053e0d1d Fix the alibi test that was left unchanged (#252) 2023-12-21 14:59:25 -08:00
Hazem Essam
0aa65c7a6b Added ALiBi implementation (#232) 2023-12-21 14:36:38 -08:00
Daniel Strobusch
794feb83df support arange for bfloat16 (#245) 2023-12-21 14:33:43 -08:00
Angelos Katharopoulos
2c7df6795e Make sure that arrays are freed when saving (#247) 2023-12-21 14:08:24 -08:00
Angelos Katharopoulos
b3916cbf2b Improve names of quantization arguments (#235)
* Change the default quantization group_size to 64
* Rename groups to group_size and width to bits
2023-12-20 16:53:53 -08:00
Angelos Katharopoulos
57fe918cf8 Adds C++ and nn quantization utilities (#230)
* Add C++ de-/quantize ops
* Add quantize functions to the docs and tests
* Add a QuantizedLinear module
2023-12-20 14:17:38 -08:00
Justin Deschenaux
4912ff3ec2 Add Lion optimizer (#209)
* Add Lion optimizer
* Update acknowledgements also with past contributions
2023-12-20 13:54:58 -08:00
Awni Hannun
f40d17047d Indexing bug (#233)
* fix

* test
2023-12-20 10:44:01 -08:00
Angelos Katharopoulos
2807c6aff0 Implements divide for integer types and adds floor_divide op (#228)
* Add floor_divide
* Add floor_divide to the tests
* Add floor_divide to the docs
2023-12-19 20:12:19 -08:00
davidkoski
de892cb66c fix for non-macos build issue on cblas.h (#227) 2023-12-19 17:01:59 -08:00
davidkoski
37024d899c fixes for building with swiftpm (#225)
- cblas is part of veclib (compile failure)
- add SWIFTPM_BUNDLE #define to allow loading the metallib from a swiftpm resource bundle
2023-12-19 16:22:10 -08:00
Diogo
137f55bf28 fail early if readinto does not exist (#221) 2023-12-19 13:27:17 -08:00
Emircan Erol
e549f84532 Triplet Loss (#211)
* Triplet Loss

* Requested Changes

* Margin to alpha
2023-12-19 12:37:12 -08:00
Angelos Katharopoulos
dfa9f4bc58 An initial quantized matmul implementation (#205)
* Add quantized matvec
* Add quantized matrix matrix with 2nd matrix transposed
* Add quantized matmul tests
* Add a slow cpu quantized matmul
* Add a slightly faster vectorized cpu version
2023-12-18 23:18:57 -08:00
Abe Leininger
e6872a4149 Added linspace (#181)
* linspace ops support

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-18 19:57:55 -08:00
Juarez Bochi
f4f6e17d45 Fix cross-attention (#210)
* Fix cross-attention

With the current code, ln2 is a no-op. Its output should be passed to the cross-attention layer

* Add name to contributors
2023-12-18 12:27:27 -08:00
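A hypothetical sketch of the wiring bug described above; the layer and attribute names (ln2, cross_attention) are illustrative, not the PR's actual code:

```
import mlx.core as mx
import mlx.nn as nn

class TinyDecoderLayer(nn.Module):
    def __init__(self, dims, heads):
        super().__init__()
        self.ln2 = nn.LayerNorm(dims)
        self.cross_attention = nn.MultiHeadAttention(dims, heads)

    def __call__(self, x, memory):
        y = self.ln2(x)
        # The bug: passing `x` below made `ln2` a no-op.
        # The fix: feed the normalized `y` to the cross-attention.
        return x + self.cross_attention(y, memory, memory)

layer = TinyDecoderLayer(16, 4)
x = mx.random.normal((1, 5, 16))
memory = mx.random.normal((1, 7, 16))
out = layer(x, memory)
```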
Angelos Katharopoulos
4d4af12c6f Adds round op and primitive (#203) 2023-12-18 11:32:48 -08:00
Awni Hannun
477397bc98 Citation + Contributor acknowledgment section (#207)
* cite

* nits

* nits

* comment
2023-12-18 10:07:00 -08:00
jojopuppet
18cca64c81 Add smoothed L1 loss and enhancements to cross entropy loss (#166)
* Add smooth_l1_loss
* Add label smoothing for cross entropy loss

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-18 07:26:21 -08:00
Awni Hannun
0e5807bbcb include optional (#202) 2023-12-17 22:01:35 -08:00
Cyril Zakka, MD
8eb56beb3a Added clip function (#159)
* Added clip

* Added Python bindings

* Formatting

* Added cpp tests

* Added Python tests

* python bindings work

* rebase

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-17 20:00:29 -08:00
Awni Hannun
ee0c2835c5 Docs updates (#198)
Reorganize NN docs + a few other tidbits.
2023-12-17 13:20:55 -08:00
Awni Hannun
90d04072b7 fix build w/ flatten (#195) 2023-12-17 11:58:45 -08:00
__mo_san__
52e1589a52 implemented Flatten Module (#149)
* implemented flatten op

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-16 21:54:37 -08:00
YUN, Junwoo
eebd7c275d Add optimizers (AdaMax, AdaDelta, RMSprop) and ordering optimizer classes (#142)
* Add AdaMax, AdaDelta, RMSprop
2023-12-16 21:43:15 -08:00
Austin Liu
a67bbfe745 Update docs (#177) (#190)
* update docs (fix #177)

* reorder

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-16 06:52:18 -08:00
Awni Hannun
104c34f906 setitem negative indexing bug (#189) 2023-12-16 06:44:47 -08:00
Diogo
dc2edc762c added tri / tril / triu (#170)
* added tri / tril / triu

* fixed tests

* ctest tests

* tri overload and simplified tests

* changes from comment

* more tests for m

* ensure assert if not 2-D

* remove broadcast_to

* minor tweaks

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2023-12-15 17:30:34 -08:00
Awni Hannun
2e02acdc83 add base kwarg to rope (#186) 2023-12-15 16:47:59 -08:00
Ronan Collobert
83f266c44c Lazy metal_device_ initialization (#185)
This ensures it is defined when the Scheduler needs it.
2023-12-15 16:06:46 -08:00
Víctor Aguilar
f24200db2c accross -> across (#183) 2023-12-15 13:46:50 -08:00
Jason
e28b57e371 Added mx.stack c++ frontend impl (#123)
* stack C++ operation + python bindings
2023-12-14 13:21:19 -08:00
Awni Hannun
e5851e52b1 Add move and swap axis, and vmap for slice, concat, and gather (#158)
* add move and swap axis, and vmap for slice, concat, and gather
2023-12-14 12:59:12 -08:00
Diogo
f55908bc48 Added stubs for python files generated from C++ (#136)
* added pybind11-stubgen

* docs for generating stubs

* added line to readme
2023-12-14 12:58:45 -08:00
Luca Arnaboldi
b93c4cf378 Floor and Ceil (#150)
* Implements Floor and Ceil Ops
2023-12-14 10:00:23 -08:00
Stv.X
1e0c78b970 Fixed typo in some proprietary terms. (#161) 2023-12-13 19:48:00 -08:00
Awni Hannun
76e1af0e02 bump version (#157) 2023-12-13 14:28:26 -08:00
Ikko Eltociear Ashimine
c3272d4917 Update conv.cpp (#145)
Peform -> Perform
2023-12-12 11:27:49 -08:00
SputNikPlop
50f5d14b11 fix: tidy pull request template (#143)
* fix: tidy pull request template

* fix: feedback from awni
2023-12-12 08:14:39 -08:00
noahsmartin
d14a0e4ff9 Docs update (#144) 2023-12-12 07:53:42 -08:00
Diogo
fb675de30d Run lint check for prs (#139) 2023-12-12 00:23:33 -08:00
Awni Hannun
25f70d4ca4 Fix divide types + floor divide (//) (#138)
* divide types

* fix black + test
2023-12-11 20:20:58 -08:00
Diogo
02de234ef0 Activations LeakyReLU / PReLU / Softplus / Mish (#109)
* Leaky_relu / prelu / softplus / mish

* added tests

* updated bench

* remove torch refs, add init to PReLU

* added arvix reference to mish

* added missing docs
2023-12-11 19:40:57 -08:00
Nicholas Santavas
f5df47ec6e Add Step, ELU, SELU, Swish activation functions (#117)
* Add Step, ELU, SELU, Swish activation functions

This commit adds the Step, ELU, SELU and Swish activation functions

* add to the docs

* review
2023-12-11 17:04:07 -08:00
Awni Hannun
b9226c367c Fix CI format + build issue (#137)
* fix ci

* Fix python bindings build

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
2023-12-11 15:01:41 -08:00
Angelos Katharopoulos
3214629601 Mlx array accessor (#128)
* Add an accessor to interoperate with custom types
* Change the docs to custom signatures
2023-12-11 13:42:55 -08:00
__mo_san__
072044e28f fix and update binary cross entropy loss tests (#133)
* fix conflicts

* updated tests
2023-12-11 12:42:17 -08:00
Cyril Zakka, MD
e080290ba4 Added eye/identity ops (#119)
`eye` and `identity` C++ and Python ops
2023-12-11 12:38:17 -08:00
Awni Hannun
69505b4e9b fixes (#131) 2023-12-11 09:26:49 -08:00
__mo_san__
f4ddd7dc44 Add Binary Cross Entropy loss (#122)
* update BCE added tests for it ...

* added binary cross entropy loss to docs

* resolving conflicts for merge
2023-12-11 07:55:18 -08:00
Jason
b0cd092b7f Added activation functions: leaky_relu relu6 softplus elu celu logsigmoid (#108)
* added leaky_relu relu6 softplus elu celu logsigmoid
* minor fixes for docstring and benchmark imports
* fixed elu implementation and added tests
* added tests for optional param, changed leaky_relu param to fit pytorch documentation
2023-12-10 16:31:38 -08:00
Awni Hannun
71d1fff90a Bug fix in metal binary kernel dispatch for large arrays (#125)
* bug fix

* format
2023-12-10 16:12:31 -08:00
Yiyang(Steven) Yu
0cfbfc9904 Update README.md (#121) 2023-12-10 14:47:37 -08:00
Awni Hannun
2d0130f80f fix loss tests (#118)
* fix loss tests

* use none as default
2023-12-10 10:08:19 -08:00
__mo_san__
c1e1c1443f Added Adagrad optimizer (#102) 2023-12-10 09:22:39 -08:00
Henry Ansah
68bf1d7867 add nn module for sigmoid activation (#111)
* add nn module for sigmoid activation

* update .gitignore with .cache folder generated by jetbrains fleet ide

* remove .cache folder
2023-12-10 07:00:39 -08:00
Angelos Katharopoulos
600db7d754 Fix build on Xcode 14 (#116)
* Fix build on Xcode 14

* Style fixes
2023-12-10 06:58:52 -08:00
__mo_san__
ef7b8756c0 Add tanh activation function (#115)
* added Adagrad optimizer ...

* added Tanh activation function ...

* reformatted file ...

* remove unrelated stuff ...

* Update activations.py
2023-12-09 19:25:38 -08:00
Enoch Kan
0b28399638 added mse_loss, nll_loss and kl_div_loss (#98)
* added mse_loss, nll_loss and kl_div_loss

* fixed axis not defined error in nll_loss

* fixed axis not defined in kl_div_loss

* added tests for mse, nll and kl_div

* modified docstrings and added reduce helper func

* updated docstring in kl_div_loss and moved helper func

* added new kl divergence implementation

* added reduction to test

* updated docstring of kl_div_loss with correct spelling

* added losses to nn.rst in docs
2023-12-09 14:25:03 -08:00
Joe Barrow
ac6dc5d3eb Adding optional bias param to MultiHeadAttention (#104)
* Adding optional bias param to MultiHeadAttention

* Run style-checker
2023-12-09 11:04:28 -08:00
Awni Hannun
89b90dcfec Pr template (#99)
* pr template
* format fix
2023-12-09 09:36:56 -08:00
Angelos Katharopoulos
fd836d891b Hashable dtype and mlx.core prefixed repr (#89)
* Make dtype hashable
* Add mlx.core prefix to our dtypes' repr
* Update the dtype test
2023-12-09 09:35:28 -08:00
AtomicVar
976e8babbe Use compiled black as the pre-commit formatter (#94) 2023-12-09 07:06:46 -08:00
Awni Hannun
2520dbcf0a add losses to the docs, fix black failure (#92) 2023-12-09 06:06:52 -08:00
Abe Leininger
430bfb4944 Adds Nesterov momentum to SGD (#87) 2023-12-08 23:23:36 -08:00
ShiJZ
08d51bf232 Make it easier to test newly implemented optimizers: no need to change the test file manually (#90)
* add helper function get_all_optimizers() in test_optimizers.py

* remove unused import
2023-12-08 21:39:08 -08:00
Kai Ma
cb9e585b8e Style fix for loss functions (#91)
* MLE and L1 loss functions

* logsoftmax change and tests

* subtract max logit for numerical stability

* l1 name change

* cross entropy reduction + unit tests

* docstrings

* l1 test name change

* old loss impl + default none

* style
2023-12-08 21:11:56 -08:00
Kai Ma
641d316484 MLE and L1 loss functions (#88)
* MLE and L1 loss functions

* logsoftmax change and tests

* subtract max logit for numerical stability

* l1 name change

* cross entropy reduction + unit tests

* docstrings

* l1 test name change

* old loss impl + default none
2023-12-08 20:21:37 -08:00
Angelos Katharopoulos
2b714714e1 Add the remainder op (#85)
* Add remainder in the C++ backend
* Add the python binding and test
2023-12-08 15:08:52 -08:00
Joe Barrow
69a24e6a1e AdamW implementation (#72)
* AdamW implementation without bias correction
* Makes use of the underlying Adam implementation
2023-12-08 14:45:34 -08:00
Zach Schillaci
5b9be57ac3 Add isort pre-commit and run (#68) 2023-12-08 11:31:47 -08:00
Jagrit Digani
e89c571de7 Update cmake to detect and throw warnings if not on a arm system (#81)
* Update cmake to detect and throw warnings if not on a arm system
2023-12-08 11:03:25 -08:00
Angelos Katharopoulos
209404239b Fix the accelerate dispatch for the power op (#70)
- The exponent and base were swapped because accelerate is using
  exponent-base instead of base-exponent
- Fix also the test for binary ops as it was testing op(x, x) which
  couldn't catch ordering errors like that
2023-12-08 10:58:03 -08:00
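A small illustration of why a test of op(x, x) cannot catch a swapped base/exponent (plain mlx ops; the values are arbitrary):

```
import mlx.core as mx

x = mx.array([2.0, 3.0])
y = mx.array([3.0, 2.0])

# With identical operands a swap is invisible: 2**2 and 3**3 either way.
print(mx.power(x, x))   # [4, 27]

# With distinct operands the ordering matters: 2**3 != 3**2.
print(mx.power(x, y))   # [8, 9]; a swapped dispatch would give [9, 8]
```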
Awni Hannun
4e3bdb560c random generation fix (#80)
Random generation fix
2023-12-08 10:40:57 -08:00
Gautam krishna R
86b614afcd added long_description for pypi readme (#69) 2023-12-08 03:33:29 -08:00
Awni Hannun
cfc39d84b7 Some docs on unified memory (#62)
* doc on unified memory
2023-12-07 19:42:24 -08:00
Zach Schillaci
d11d77e581 Spelling fixes in transformer.py (#59) 2023-12-07 13:32:09 -08:00
Jagrit Digani
bf410cb85e Update CMake to not try and build metallib if Metal framework not found (#55) 2023-12-07 09:48:42 -08:00
rushyam
2e126aeb7e Feature Addition: Encoder-Decoder Transformer Architecture (#50)
* Implemented decoder-transformer-layer, decoder-transformer, and introduced encoder-decoder transformer

* added relu layer

* add src, tgt, memory mask

---------

Co-authored-by: rushyam <rushyam@rushyams-MacBook-Air.local>
2023-12-07 07:37:36 -08:00
Awni Hannun
dfbc52ce56 Install docs + python versions (#53)
* install + python versions

* add link in install docs

* add link
2023-12-07 07:29:17 -08:00
527 changed files with 97273 additions and 18228 deletions

.circleci/config.yml

@@ -1,5 +1,8 @@
version: 2.1
orbs:
apple: ml-explore/pr-approval@0.1.0
parameters:
nightly_build:
type: boolean
@@ -7,8 +10,65 @@ parameters:
weekly_build:
type: boolean
default: false
test_release:
type: boolean
default: false
linux_release:
type: boolean
default: false
jobs:
build_documentation:
parameters:
upload-docs:
type: boolean
default: false
macos:
xcode: "15.2.0"
resource_class: macos.m1.medium.gen1
steps:
- checkout
- run:
name: Install
command: |
brew install python@3.9
brew install doxygen
python3.9 -m venv env
source env/bin/activate
pip install --upgrade pip
pip install --upgrade cmake
pip install -r docs/requirements.txt
CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
- when:
condition:
not: << parameters.upload-docs >>
steps:
- run:
name: Build documentation
command: |
source env/bin/activate
cd docs && doxygen && make html O=-W
- when:
condition: << parameters.upload-docs >>
steps:
- add_ssh_keys:
fingerprints:
- "SHA256:OhcVVMovbT0pkgMeiVRyxMnjV9R2t+hKBsNcuxq9h+0"
- run:
name: Upload documentation
command: |
source env/bin/activate
git config user.email "mlx@group.apple.com"
git config user.name "CircleCI Docs"
git checkout gh-pages
git rebase main
cd docs
git rm -rf build/html
doxygen && make html O=-W
git add -f build/html
git commit -m "rebase"
git push -f origin gh-pages
linux_build_and_test:
docker:
- image: cimg/python:3.9
@@ -25,176 +85,262 @@ jobs:
name: Install dependencies
command: |
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install nanobind==2.4.0
pip install numpy
sudo apt-get update
sudo apt-get install libblas-dev
sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
- run:
name: Build python package
name: Install Python package
command: |
CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" CMAKE_BUILD_PARALLEL_LEVEL="" python3 setup.py build_ext --inplace
CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" CMAKE_BUILD_PARALLEL_LEVEL="" python3 setup.py develop
CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
python3 setup.py build_ext --inplace
CMAKE_ARGS="-DMLX_BUILD_METAL=OFF" \
CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
python3 setup.py develop
- run:
name: Run the python tests
name: Generate package stubs
command: |
python3 -m unittest discover python/tests
echo "stubs"
pip install typing_extensions
python setup.py generate_stubs
- run:
name: Run Python tests
command: |
python3 -m unittest discover python/tests -v
- run:
name: Build CPP only
command: |
mkdir -p build && cd build && cmake .. -DMLX_BUILD_METAL=OFF && make -j
mkdir -p build && cd build
cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
make -j `nproc`
- run:
name: Run CPP tests
command: ./build/tests/tests
mac_build_and_test:
machine: true
resource_class: ml-explore/m-builder
parameters:
xcode_version:
type: string
default: "15.2.0"
macos:
xcode: << parameters.xcode_version >>
resource_class: macos.m1.medium.gen1
steps:
- checkout
- run:
name: Install dependencies
command: |
eval "$(conda shell.bash hook)"
rm -r $CONDA_PREFIX/envs/runner-env
conda create -y -n runner-env python=3.9
conda activate runner-env
brew install python@3.9
brew install openmpi
python3.9 -m venv env
source env/bin/activate
pip install --upgrade pip
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install nanobind==2.4.0
pip install numpy
pip install torch
pip install tensorflow
pip install unittest-xml-reporting
- run:
name: Build python package
name: Install Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
CMAKE_BUILD_PARALLEL_LEVEL="" python setup.py build_ext --inplace
CMAKE_BUILD_PARALLEL_LEVEL="" python setup.py develop
source env/bin/activate
DEBUG=1 CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install -e . -v
- run:
name: Run the python tests
name: Generate package stubs
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
DEVICE=gpu python -m xmlrunner discover -v python/tests -o test-results/gpu
source env/bin/activate
pip install typing_extensions
python setup.py generate_stubs
- run:
name: Run Python tests
command: |
source env/bin/activate
LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
- run:
name: Build example extension
command: |
source env/bin/activate
cd examples/extensions
pip install -r requirements.txt
python setup.py build_ext -j8
- store_test_results:
path: test-results
- run:
name: Build CPP only
command: |
source env/bin/activate
mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
- run:
name: Run CPP tests
command: |
DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
- run:
name: Build small binary
command: |
source env/bin/activate
cd build/
cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
-DBUILD_SHARED_LIBS=ON \
-DMLX_BUILD_CPU=OFF \
-DMLX_BUILD_SAFETENSORS=OFF \
-DMLX_BUILD_GGUF=OFF \
-DMLX_METAL_JIT=ON
make -j `sysctl -n hw.ncpu`
- run:
name: Run Python tests with JIT
command: |
source env/bin/activate
CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
pip install -e . -v
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
METAL_DEBUG_ERROR_MODE=0 \
python -m xmlrunner discover -v python/tests -o test-results/gpu_jit
build_release:
machine: true
resource_class: ml-explore/m-builder
parameters:
python_version:
type: string
default: "3.9"
macos_version:
xcode_version:
type: string
default: "14"
default: "15.2.0"
build_env:
type: string
default: ""
macos:
xcode: << parameters.xcode_version >>
resource_class: macos.m1.medium.gen1
steps:
- checkout
- run:
name: Install dependencies
command: |
eval "$(conda shell.bash hook)"
rm -r $CONDA_PREFIX/envs/runner-env
conda create -y -n runner-env python=<< parameters.python_version >>
conda activate runner-env
brew install python@<< parameters.python_version >>
brew install openmpi
python<< parameters.python_version >> -m venv env
source env/bin/activate
pip install --upgrade pip
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install nanobind==2.4.0
pip install --upgrade setuptools
pip install numpy
pip install twine
pip install build
- run:
name: Build pacakge
name: Install Python package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVELOPER_DIR=$(developer_dir_macos_<< parameters.macos_version >>) \
PYPI_RELEASE=1 \
CMAKE_BUILD_PARALLEL_LEVEL="" \
python setup.py bdist_wheel
twine upload dist/* --repository mlx
source env/bin/activate
DEV_RELEASE=1 \
CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
pip install . -v
- run:
name: Generate package stubs
command: |
source env/bin/activate
pip install typing_extensions
python setup.py generate_stubs
- run:
name: Build Python package
command: |
source env/bin/activate
<< parameters.build_env >> \
CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` \
python -m build -w
- when:
condition: << parameters.build_env >>
steps:
- run:
name: Upload package
command: |
source env/bin/activate
twine upload dist/*
- store_artifacts:
path: dist/
build_dev_release:
machine: true
resource_class: ml-explore/m-builder
build_linux_release:
parameters:
python_version:
type: string
default: "3.9"
macos_version:
extra_env:
type: string
default: "14"
default: "DEV_RELEASE=1"
docker:
- image: ubuntu:20.04
steps:
- checkout
- run:
name: Install dependencies
name: Build wheel
command: |
eval "$(conda shell.bash hook)"
rm -r $CONDA_PREFIX/envs/runner-env
conda create -y -n runner-env python=<< parameters.python_version >>
conda activate runner-env
PYTHON=python<< parameters.python_version >>
apt-get update
apt-get upgrade -y
DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
apt-get install -y apt-utils
apt-get install -y software-properties-common
add-apt-repository -y ppa:deadsnakes/ppa
apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
apt-get install -y libblas-dev liblapack-dev liblapacke-dev
apt-get install -y build-essential git
$PYTHON -m venv env
source env/bin/activate
pip install --upgrade pip
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install nanobind==2.4.0
pip install --upgrade setuptools
pip install numpy
pip install auditwheel
pip install patchelf
pip install build
pip install twine
<< parameters.extra_env >> \
CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
pip install . -v
pip install typing_extensions
python setup.py generate_stubs
<< parameters.extra_env >> \
CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
python -m build --wheel
auditwheel show dist/*
auditwheel repair dist/* --plat manylinux_2_31_x86_64
- run:
name: Build pacakge
name: Upload package
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVELOPER_DIR=$(developer_dir_macos_<< parameters.macos_version >>) \
DEV_RELEASE=1 \
CMAKE_BUILD_PARALLEL_LEVEL="" \
python setup.py bdist_wheel
twine upload dist/* --repository mlx
source env/bin/activate
twine upload wheelhouse/*
- store_artifacts:
path: dist/
build_package:
machine: true
resource_class: ml-explore/m-builder
parameters:
python_version:
type: string
default: "3.9"
macos_version:
type: string
default: "14"
steps:
- checkout
- run:
name: Install dependencies
command: |
eval "$(conda shell.bash hook)"
rm -r $CONDA_PREFIX/envs/runner-env
conda create -y -n runner-env python=<< parameters.python_version >>
conda activate runner-env
pip install --upgrade cmake
pip install --upgrade pybind11[global]
pip install numpy
pip install twine
- run:
name: Build pacakge
command: |
eval "$(conda shell.bash hook)"
conda activate runner-env
DEVELOPER_DIR=$(developer_dir_macos_<< parameters.macos_version >>) \
CMAKE_BUILD_PARALLEL_LEVEL="" \
python setup.py bdist_wheel
- store_artifacts:
path: dist/
path: wheelhouse/
workflows:
build_and_test:
when:
and:
- matches:
pattern: "^(?!pull/)[-\\w]+$"
value: << pipeline.git.branch >>
- not: << pipeline.parameters.nightly_build >>
- not: << pipeline.parameters.weekly_build >>
- not: << pipeline.parameters.test_release >>
jobs:
- mac_build_and_test:
matrix:
parameters:
xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
- linux_build_and_test
- mac_build_and_test
- build_documentation
build_pypi_release:
when:
and:
- not: << pipeline.parameters.nightly_build >>
- not: << pipeline.parameters.weekly_build >>
- not: << pipeline.parameters.test_release >>
jobs:
- build_release:
filters:
tags:
@@ -203,21 +349,65 @@ workflows:
ignore: /.*/
matrix:
parameters:
python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
macos_version: ["13", "14"]
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
xcode_version: ["15.0.0", "15.2.0"]
build_env: ["PYPI_RELEASE=1"]
- build_documentation:
filters:
tags:
only: /^v.*/
branches:
ignore: /.*/
upload-docs: true
prb:
when:
matches:
pattern: "^pull/\\d+(/head)?$"
value: << pipeline.git.branch >>
jobs:
- hold:
type: approval
- apple/authenticate:
context: pr-approval
- mac_build_and_test:
requires: [ hold ]
matrix:
parameters:
xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
- linux_build_and_test:
requires: [ hold ]
nightly_build:
when: << pipeline.parameters.nightly_build >>
when:
and:
- equal: [ main, << pipeline.git.branch >> ]
- << pipeline.parameters.nightly_build >>
jobs:
- build_package:
- build_release:
matrix:
parameters:
python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
macos_version: ["13", "14"]
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
xcode_version: ["15.0.0", "15.2.0"]
weekly_build:
when: << pipeline.parameters.weekly_build >>
when:
and:
- equal: [ main, << pipeline.git.branch >> ]
- << pipeline.parameters.weekly_build >>
jobs:
- build_dev_release:
- build_release:
matrix:
parameters:
python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
macos_version: ["13", "14"]
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
build_env: ["DEV_RELEASE=1"]
linux_test_release:
when:
and:
- equal: [ main, << pipeline.git.branch >> ]
- << pipeline.parameters.linux_release >>
jobs:
- build_linux_release:
matrix:
parameters:
python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
extra_env: ["PYPI_RELEASE=1"]

.github/ISSUE_TEMPLATE/bug_report.md (new file)

@@ -0,0 +1,28 @@
---
name: Bug report
about: Create a report about an issue you've encountered
title: "[BUG] "
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Include code snippet
```python
```
**Expected behavior**
A clear and concise description of what you expected to happen.
**Desktop (please complete the following information):**
- OS Version: [e.g. MacOS 14.1.2]
- Version [e.g. 0.7.0]
**Additional context**
Add any other context about the problem here.

.github/pull_request_template.md (new file)

@@ -0,0 +1,12 @@
## Proposed changes
Please include a description of the problem or feature this PR is addressing. If there is a corresponding issue, include the issue #.
## Checklist
Put an `x` in the boxes that apply.
- [ ] I have read the [CONTRIBUTING](https://github.com/ml-explore/mlx/blob/main/CONTRIBUTING.md) document
- [ ] I have run `pre-commit run --all-files` to format my code / installed pre-commit prior to committing changes
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have updated the necessary documentation (if needed)

.github/workflows/pull_request.yml

@@ -0,0 +1,20 @@
on:
pull_request:
branches:
- main
jobs:
check_lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pre-commit black isort clang-format
- name: Run lint
run: |
pre-commit run --all-files

.gitignore

@@ -6,11 +6,16 @@ __pycache__/
# C extensions
*.so
# tensor files
*.safe
*.safetensors
# Metal libraries
*.metallib
venv/
# Distribution / packaging
python/mlx/core
python/mlx/share
python/mlx/include
.Python
@@ -71,6 +76,12 @@ build/
*.out
*.app
# Debug symbols
*.pdb
# VSCode
.vscode/
.DS_Store
# Jetbrains
.cache

.pre-commit-config.yaml

@@ -1,9 +1,21 @@
repos:
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v14.0.6
rev: v19.1.4
hooks:
- id: clang-format
- repo: https://github.com/psf/black
rev: 22.10.0
# Using this mirror lets us use mypyc-compiled black, which is about 2x faster
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.10.0
hooks:
- id: black
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
args:
- --profile=black
- repo: https://github.com/cheshirekow/cmake-format-precommit
rev: v0.6.13
hooks:
- id: cmake-format

ACKNOWLEDGMENTS.md

@@ -1,3 +1,31 @@
# Individual Contributors
If you wish to be acknowledged for your contributions, please list your name
with a short description of your contribution(s) below. For example:
- Jane Smith: Added the `foo` and `bar` ops.
MLX was developed with contributions from the following individuals:
- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`. Added `cross`.
- Juarez Bochi: Fixed bug in cross attention.
- Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream`, safetensors support, `einsum`, and `einsum_path`.
- Gabrijel Boduljak: Added `mlx.core.linalg`, implemented `norm` method and `InstanceNorm` layer. Implemented pooling layers and ``Upsample``.
- Hinrik Snær Guðmundsson: Added `atleast_1d`, `atleast_2d`, `atleast_3d` ops.
- Luca Arnaboldi: Added `Ceil` and `Floor` ops; implemented pickling, copy and deepcopy for mlx arrays.
- Brian Keene & Atila Orhon, with Argmax Inc.: Added `fast.scaled_dot_product_attention`
- AmirHossein Razlighi: Added chaining support for some of the ops in `nn.Module`. Comparison works for non array objects in `mlx.core.array`. Exception handling for invalid operations in `mlx.core.array`.
- Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
- Paul Paczuski: Improved stability of BCE loss calculation
- Max-Heinrich Laves: Added `conv_transpose1d`, `conv_transpose2d`, and `conv_transpose3d` ops.
<a href="https://github.com/ml-explore/mlx/graphs/contributors">
<img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
</a>
# Third-Party Software
MLX leverages several third-party software, listed here together with
their license copied verbatim.

CITATION.cff

@@ -0,0 +1,24 @@
cff-version: 1.2.0
title: mlx
message: >-
If you use this software, please cite it using the
metadata from this file.
type: software
authors:
- given-names: Awni
family-names: Hannun
affiliation: Apple
- given-names: Jagrit
family-names: Digani
affiliation: Apple
- given-names: Angelos
family-names: Katharopoulos
affiliation: Apple
- given-names: Ronan
family-names: Collobert
affiliation: Apple
repository-code: 'https://github.com/ml-explore'
abstract: >-
MLX: efficient and flexible machine learning on Apple
silicon
license: MIT

CMakeLists.txt

@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.24)
cmake_minimum_required(VERSION 3.25)
project(mlx LANGUAGES CXX)
project(mlx LANGUAGES C CXX)
# ----------------------------- Setup -----------------------------
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
@@ -15,10 +15,46 @@ option(MLX_BUILD_EXAMPLES "Build examples for mlx" ON)
option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
option(MLX_BUILD_METAL "Build metal backend" ON)
option(MLX_BUILD_CPU "Build cpu backend" ON)
option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
option(MLX_BUILD_BLAS_FROM_SOURCE "Build OpenBLAS from source code" OFF)
option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
if(NOT MLX_VERSION)
set(MLX_VERSION 0.0.3)
set(MLX_VERSION 0.22.0)
endif()
add_compile_definitions("MLX_VERSION=${MLX_VERSION}")
# --------------------- Processor tests -------------------------
message(
STATUS
"Building MLX for ${CMAKE_SYSTEM_PROCESSOR} processor on ${CMAKE_SYSTEM_NAME}"
)
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
if(NOT MLX_ENABLE_X64_MAC)
message(
FATAL_ERROR
"Building for x86_64 on macOS is not supported."
" If you are on an Apple silicon system, check the build"
" documentation for possible fixes: "
"https://ml-explore.github.io/mlx/build/html/install.html#build-from-source"
)
else()
set(MLX_BUILD_METAL OFF)
message(WARNING "Building for x86_64 arch is not officially supported.")
endif()
endif()
else()
set(MLX_BUILD_METAL OFF)
message(WARNING "MLX is prioritised for Apple silicon systems using macOS.")
endif()
# ----------------------------- Lib -----------------------------
@@ -29,102 +65,196 @@ cmake_policy(SET CMP0135 NEW)
add_library(mlx)
if (MLX_BUILD_METAL)
find_library(METAL_LIB Metal)
find_library(FOUNDATION_LIB Foundation)
find_library(QUARTZ_LIB QuartzCore)
if(MLX_BUILD_METAL)
set(METAL_LIB "-framework Metal")
set(FOUNDATION_LIB "-framework Foundation")
set(QUARTZ_LIB "-framework QuartzCore")
endif()
if (MLX_BUILD_METAL AND NOT METAL_LIB)
if(MLX_BUILD_METAL AND NOT METAL_LIB)
message(STATUS "Metal not found. Unable to build GPU")
elseif (MLX_BUILD_METAL)
set(MLX_BUILD_METAL OFF)
set(MLX_METAL_DEBUG OFF)
elseif(MLX_BUILD_METAL)
message(STATUS "Building METAL sources")
add_compile_definitions(_METAL_)
execute_process(COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
OUTPUT_VARIABLE MACOS_VERSION)
message(STATUS "Building with SDK for MacOS version ${MACOS_VERSION}")
if (${MACOS_VERSION} GREATER_EQUAL 14.2)
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14.2_iOS17.2.zip)
elseif (${MACOS_VERSION} GREATER_EQUAL 14.0)
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14_iOS17-beta.zip)
elseif (${MACOS_VERSION} GREATER_EQUAL 13.3)
set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS13.3_iOS16.4.zip)
else()
message(FATAL_ERROR "MLX requires MacOS >= 13.4 to be built with MLX_BUILD_METAL=ON" )
if(MLX_METAL_DEBUG)
add_compile_definitions(MLX_METAL_DEBUG)
endif()
FetchContent_Declare(
metal_cpp
URL ${METAL_CPP_URL}
)
# Throw an error if xcrun not found
execute_process(
COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
OUTPUT_VARIABLE MACOS_SDK_VERSION COMMAND_ERROR_IS_FATAL ANY)
if(${MACOS_SDK_VERSION} LESS 14.0)
message(
FATAL_ERROR
"MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON")
endif()
message(STATUS "Building with macOS SDK version ${MACOS_SDK_VERSION}")
set(METAL_CPP_URL
https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18.zip)
if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
endif()
execute_process(
COMMAND
zsh "-c"
"echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
FetchContent_Declare(metal_cpp URL ${METAL_CPP_URL})
FetchContent_MakeAvailable(metal_cpp)
target_include_directories(
mlx PUBLIC
$<BUILD_INTERFACE:${metal_cpp_SOURCE_DIR}>
$<INSTALL_INTERFACE:include/metal_cpp>
)
target_link_libraries(
mlx
${METAL_LIB}
${FOUNDATION_LIB}
${QUARTZ_LIB})
mlx PUBLIC $<BUILD_INTERFACE:${metal_cpp_SOURCE_DIR}>
$<INSTALL_INTERFACE:include/metal_cpp>)
target_link_libraries(mlx PUBLIC ${METAL_LIB} ${FOUNDATION_LIB} ${QUARTZ_LIB})
endif()
find_library(ACCELERATE_LIBRARY Accelerate)
if (ACCELERATE_LIBRARY)
message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
set(MLX_BUILD_ACCELERATE ON)
target_link_libraries(mlx ${ACCELERATE_LIBRARY})
add_compile_definitions(ACCELERATE_NEW_LAPACK)
else()
message(STATUS "Accelerate not found, using default backend.")
set(MLX_BUILD_ACCELERATE OFF)
#set(BLA_VENDOR Generic)
find_package(BLAS REQUIRED)
if (NOT BLAS_FOUND)
message(FATAL_ERROR "Must have BLAS installed")
if(WIN32)
if(MSVC)
# GGUF does not build with MSVC.
set(MLX_BUILD_GGUF OFF)
# There is no prebuilt OpenBLAS distribution for MSVC.
set(MLX_BUILD_BLAS_FROM_SOURCE ON)
endif()
# Windows implementation of dlfcn.h APIs.
FetchContent_Declare(
dlfcn-win32
GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
GIT_TAG v1.4.1
EXCLUDE_FROM_ALL)
block()
set(BUILD_SHARED_LIBS OFF)
FetchContent_MakeAvailable(dlfcn-win32)
endblock()
target_include_directories(mlx PRIVATE "${dlfcn-win32_SOURCE_DIR}/src")
target_link_libraries(mlx PRIVATE dl)
endif()
if(MLX_BUILD_CPU)
find_library(ACCELERATE_LIBRARY Accelerate)
if(ACCELERATE_LIBRARY)
message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
set(MLX_BUILD_ACCELERATE ON)
else()
message(STATUS "Accelerate or arm neon not found, using default backend.")
set(MLX_BUILD_ACCELERATE OFF)
endif()
if(MLX_BUILD_ACCELERATE)
target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
add_compile_definitions(ACCELERATE_NEW_LAPACK)
elseif(MLX_BUILD_BLAS_FROM_SOURCE)
# Download and build OpenBLAS from source code.
FetchContent_Declare(
openblas
GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git
GIT_TAG v0.3.28
EXCLUDE_FROM_ALL)
set(BUILD_STATIC_LIBS ON) # link statically
set(NOFORTRAN ON) # msvc has no fortran compiler
FetchContent_MakeAvailable(openblas)
target_link_libraries(mlx PRIVATE openblas)
target_include_directories(
mlx PRIVATE "${openblas_SOURCE_DIR}/lapack-netlib/LAPACKE/include"
"${CMAKE_BINARY_DIR}/generated" "${CMAKE_BINARY_DIR}")
else()
if(${CMAKE_HOST_APPLE})
# The blas shipped in macOS SDK is not supported, search homebrew for
# openblas instead.
set(BLA_VENDOR OpenBLAS)
set(LAPACK_ROOT
"${LAPACK_ROOT};$ENV{LAPACK_ROOT};/usr/local/opt/openblas")
endif()
# Search and link with lapack.
find_package(LAPACK REQUIRED)
if(NOT LAPACK_FOUND)
message(FATAL_ERROR "Must have LAPACK installed")
endif()
find_path(LAPACK_INCLUDE_DIRS lapacke.h /usr/include /usr/local/include
/usr/local/opt/openblas/include)
message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
target_link_libraries(mlx PRIVATE ${LAPACK_LIBRARIES})
# List blas after lapack otherwise we may accidentally include an old
# version of lapack.h from the include dirs of blas.
find_package(BLAS REQUIRED)
if(NOT BLAS_FOUND)
message(FATAL_ERROR "Must have BLAS installed")
endif()
# TODO find a cleaner way to do this
find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include /usr/local/include
$ENV{BLAS_HOME}/include)
message(STATUS "Blas lib " ${BLAS_LIBRARIES})
message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
target_link_libraries(mlx PRIVATE ${BLAS_LIBRARIES})
endif()
else()
set(MLX_BUILD_ACCELERATE OFF)
endif()
find_package(MPI)
if(MPI_FOUND)
execute_process(
COMMAND zsh "-c" "mpirun --version"
OUTPUT_VARIABLE MPI_VERSION
ERROR_QUIET)
if(${MPI_VERSION} MATCHES ".*Open MPI.*")
target_include_directories(mlx PRIVATE ${MPI_INCLUDE_PATH})
elseif(MPI_VERSION STREQUAL "")
set(MPI_FOUND FALSE)
message(
WARNING "MPI found but mpirun is not available. Building without MPI.")
else()
set(MPI_FOUND FALSE)
message(WARNING "MPI which is not OpenMPI found. Building without MPI.")
endif()
# TODO find a cleaner way to do this
find_path(BLAS_INCLUDE_DIRS cblas.h
/usr/include
/usr/local/include
$ENV{BLAS_HOME}/include)
message(STATUS ${BLAS_LIBRARIES})
message(STATUS ${BLAS_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
target_link_libraries(mlx ${BLAS_LIBRARIES})
endif()
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)
target_include_directories(
mlx
PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:include>
)
mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:include>)
if (MLX_BUILD_PYTHON_BINDINGS)
FetchContent_Declare(
fmt
GIT_REPOSITORY https://github.com/fmtlib/fmt.git
GIT_TAG 10.2.1
EXCLUDE_FROM_ALL)
FetchContent_MakeAvailable(fmt)
target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:fmt::fmt-header-only>)
if(MLX_BUILD_PYTHON_BINDINGS)
message(STATUS "Building Python bindings.")
find_package(Python COMPONENTS Interpreter Development)
find_package(pybind11 CONFIG REQUIRED)
find_package(
Python 3.8
COMPONENTS Interpreter Development.Module
REQUIRED)
execute_process(
COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE nanobind_ROOT)
find_package(nanobind CONFIG REQUIRED)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/python/src)
endif()
if (MLX_BUILD_TESTS)
if(MLX_BUILD_TESTS)
include(CTest)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tests)
endif()
if (MLX_BUILD_EXAMPLES)
if(MLX_BUILD_EXAMPLES)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/examples/cpp)
endif()
if (MLX_BUILD_BENCHMARKS)
if(MLX_BUILD_BENCHMARKS)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/benchmarks/cpp)
endif()
@@ -133,32 +263,31 @@ include(GNUInstallDirs)
# Install library
install(
TARGETS mlx
EXPORT MLXTargets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
TARGETS mlx
EXPORT MLXTargets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
INCLUDES
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
# Install headers
install(
DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/mlx
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
COMPONENT headers
FILES_MATCHING PATTERN "*.h"
)
DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/mlx
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
COMPONENT headers
FILES_MATCHING
PATTERN "*.h"
PATTERN "backend/metal/kernels.h" EXCLUDE)
# Install metal dependencies
if (MLX_BUILD_METAL)
if(MLX_BUILD_METAL)
# Install metal cpp
install(
DIRECTORY ${metal_cpp_SOURCE_DIR}/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/metal_cpp
COMPONENT metal_cpp_source
)
DIRECTORY ${metal_cpp_SOURCE_DIR}/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/metal_cpp
COMPONENT metal_cpp_source)
endif()
@@ -170,31 +299,24 @@ set(MLX_CMAKE_INSTALL_MODULE_DIR share/cmake/MLX)
install(
EXPORT MLXTargets
FILE MLXTargets.cmake
DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
)
DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
${MLX_CMAKE_BUILD_VERSION_CONFIG}
COMPATIBILITY SameMajorVersion
VERSION ${MLX_VERSION}
)
VERSION ${MLX_VERSION})
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/mlx.pc.in
${MLX_CMAKE_BUILD_CONFIG}
${CMAKE_CURRENT_LIST_DIR}/mlx.pc.in ${MLX_CMAKE_BUILD_CONFIG}
INSTALL_DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
NO_CHECK_REQUIRED_COMPONENTS_MACRO
PATH_VARS CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_INCLUDEDIR MLX_CMAKE_INSTALL_MODULE_DIR
)
PATH_VARS CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_INCLUDEDIR
MLX_CMAKE_INSTALL_MODULE_DIR)
install(
FILES ${MLX_CMAKE_BUILD_CONFIG} ${MLX_CMAKE_BUILD_VERSION_CONFIG}
DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
)
install(FILES ${MLX_CMAKE_BUILD_CONFIG} ${MLX_CMAKE_BUILD_VERSION_CONFIG}
DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})
install(
DIRECTORY ${CMAKE_MODULE_PATH}/
DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
)
install(DIRECTORY ${CMAKE_MODULE_PATH}/
DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})

MANIFEST.in

@@ -1,3 +1,4 @@
include CMakeLists.txt
recursive-include mlx/ *
include python/src/*
include python/mlx/py.typed # support type hinting as in PEP-561

README.md

@@ -6,34 +6,36 @@
[![CircleCI](https://circleci.com/gh/ml-explore/mlx.svg?style=svg)](https://circleci.com/gh/ml-explore/mlx)
MLX is an array framework for machine learning on Apple silicon, brought to you
by Apple machine learning research.
MLX is an array framework for machine learning on Apple silicon,
brought to you by Apple machine learning research.
Some key features of MLX include:
- **Familiar APIs**: MLX has a Python API that closely follows NumPy.
MLX also has a fully featured C++ API, which closely mirrors the Python API.
MLX has higher-level packages like `mlx.nn` and `mlx.optimizers` with APIs
that closely follow PyTorch to simplify building more complex models.
- **Familiar APIs**: MLX has a Python API that closely follows NumPy. MLX
also has fully featured C++, [C](https://github.com/ml-explore/mlx-c), and
[Swift](https://github.com/ml-explore/mlx-swift/) APIs, which closely mirror
the Python API. MLX has higher-level packages like `mlx.nn` and
`mlx.optimizers` with APIs that closely follow PyTorch to simplify building
more complex models.
- **Composable function transformations**: MLX has composable function
- **Composable function transformations**: MLX supports composable function
transformations for automatic differentiation, automatic vectorization,
and computation graph optimization.
- **Lazy computation**: Computations in MLX are lazy. Arrays are only
materialized when needed.
- **Dynamic graph construction**: Computation graphs in MLX are built
- **Dynamic graph construction**: Computation graphs in MLX are constructed
dynamically. Changing the shapes of function arguments does not trigger
slow compilations, and debugging is simple and intuitive.
- **Multi-device**: Operations can run on any of the supported devices
(currently, the CPU and GPU).
(currently the CPU and the GPU).
- **Unified memory**: A notable difference between MLX and other frameworks
is the *unified memory model*. Arrays in MLX live in shared memory.
Operations on MLX arrays can be performed on any of the supported
device types without moving data.
device types without transferring data.
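A minimal sketch of the ideas in the feature list above (NumPy-like API, lazy evaluation, composable `grad`, and unified memory via the `stream` argument), using the public `mlx.core` Python API. This example is added here for illustration and is not part of the README diff:

```python
import mlx.core as mx

# Lazy computation: building c only records the graph; nothing runs until eval.
a = mx.random.normal((4, 4))
b = mx.ones((4, 4))
c = (a + b).sum()
mx.eval(c)  # materializes c on the default device

# Composable function transformation: grad returns a new function.
def loss(x):
    return mx.sum(mx.square(x))

dloss = mx.grad(loss)
print(dloss(a))  # gradient of sum(x^2) is 2 * x

# Unified memory: the same arrays can be used on a CPU stream without
# an explicit transfer between devices.
d = mx.add(a, b, stream=mx.cpu)
mx.eval(d)
```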
MLX is designed by machine learning researchers for machine learning
researchers. The framework is intended to be user-friendly, but still efficient
@@ -53,7 +55,7 @@ variety of examples, including:
- [Transformer language model](https://github.com/ml-explore/mlx-examples/tree/main/transformer_lm) training.
- Large-scale text generation with
[LLaMA](https://github.com/ml-explore/mlx-examples/tree/main/llama) and
[LLaMA](https://github.com/ml-explore/mlx-examples/tree/main/llms/llama) and
finetuning with [LoRA](https://github.com/ml-explore/mlx-examples/tree/main/lora).
- Generating images with [Stable Diffusion](https://github.com/ml-explore/mlx-examples/tree/main/stable_diffusion).
- Speech recognition with [OpenAI's Whisper](https://github.com/ml-explore/mlx-examples/tree/main/whisper).
@@ -61,22 +63,54 @@ variety of examples, including:
## Quickstart
See the [quick start
guide](https://ml-explore.github.io/mlx/build/html/quick_start.html)
guide](https://ml-explore.github.io/mlx/build/html/usage/quick_start.html)
in the documentation.
## Installation
MLX is available on [PyPi](https://pypi.org/project/mlx/). To install the Python API, run:
MLX is available on [PyPI](https://pypi.org/project/mlx/). To install the Python API, run:
**With `pip`**:
```
pip install mlx
```
**With `conda`**:
```
conda install -c conda-forge mlx
```
Checkout the
[documentation](https://ml-explore.github.io/mlx/build/html/install.html#)
for more information on building the C++ and Python APIs from source.
## Contributing
Check out the [contribution guidelines](CONTRIBUTING.md) for more information
on contributing to MLX.
Check out the [contribution guidelines](https://github.com/ml-explore/mlx/tree/main/CONTRIBUTING.md) for more information
on contributing to MLX. See the
[docs](https://ml-explore.github.io/mlx/build/html/install.html) for more
information on building from source, and running tests.
We are grateful for all of [our
contributors](https://github.com/ml-explore/mlx/tree/main/ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
to MLX and wish to be acknowledged, please add your name to the list in your
pull request.
## Citing MLX
The MLX software suite was initially developed with equal contribution by Awni
Hannun, Jagrit Digani, Angelos Katharopoulos, and Ronan Collobert. If you find
MLX useful in your research and wish to cite it, please use the following
BibTex entry:
```
@software{mlx2023,
author = {Awni Hannun and Jagrit Digani and Angelos Katharopoulos and Ronan Collobert},
title = {{MLX}: Efficient and flexible machine learning on Apple silicon},
url = {https://github.com/ml-explore},
version = {0.0},
year = {2023},
}
```

View File

@@ -5,35 +5,35 @@
#include "mlx/mlx.h"
#include "time_utils.h"
using namespace mlx::core;
namespace mx = mlx::core;
void time_value_and_grad() {
auto x = ones({200, 1000});
eval(x);
auto fn = [](array x) {
auto x = mx::ones({200, 1000});
mx::eval(x);
auto fn = [](mx::array x) {
for (int i = 0; i < 20; ++i) {
x = log(exp(x));
x = mx::log(mx::exp(x));
}
return sum(x);
return mx::sum(x);
};
auto grad_fn = grad(fn);
auto grad_fn = mx::grad(fn);
auto independent_value_and_grad = [&]() {
auto value = fn(x);
auto dfdx = grad_fn(x);
return std::vector<array>{value, dfdx};
return std::vector<mx::array>{value, dfdx};
};
TIME(independent_value_and_grad);
auto value_and_grad_fn = value_and_grad(fn);
auto value_and_grad_fn = mx::value_and_grad(fn);
auto combined_value_and_grad = [&]() {
auto [value, dfdx] = value_and_grad_fn(x);
return std::vector<array>{value, dfdx};
return std::vector<mx::array>{value, dfdx};
};
TIME(combined_value_and_grad);
}
int main() {
std::cout << "Benchmarks for " << default_device() << std::endl;
std::cout << "Benchmarks for " << mx::default_device() << std::endl;
time_value_and_grad();
}

View File

@@ -4,21 +4,21 @@
#include "mlx/mlx.h"
#include "time_utils.h"
using namespace mlx::core;
namespace mx = mlx::core;
void time_add_op() {
std::vector<int> sizes(1, 1);
for (int i = 0; i < 9; ++i) {
sizes.push_back(10 * sizes.back());
}
set_default_device(Device::cpu);
set_default_device(mx::Device::cpu);
for (auto size : sizes) {
auto a = random::uniform({size});
auto b = random::uniform({size});
eval(a, b);
auto a = mx::random::uniform({size});
auto b = mx::random::uniform({size});
mx::eval(a, b);
std::cout << "Size " << size << std::endl;
TIMEM("cpu", add, a, b, Device::cpu);
TIMEM("gpu", add, a, b, Device::gpu);
TIMEM("cpu", mx::add, a, b, mx::Device::cpu);
TIMEM("gpu", mx::add, a, b, mx::Device::gpu);
}
}

View File

@@ -6,105 +6,105 @@
#include "mlx/mlx.h"
#include "time_utils.h"
using namespace mlx::core;
namespace mx = mlx::core;
void time_irregular_binary_ops_1D() {
auto device = default_device();
auto device = mx::default_device();
int size = 1000000;
int step = 2;
auto a = random::uniform({size});
auto b = random::uniform({size});
eval(a, b);
auto a = mx::random::uniform({size});
auto b = mx::random::uniform({size});
mx::eval(a, b);
a = slice(a, {0}, {size}, {step});
b = slice(b, {0}, {size}, {step});
TIMEM("1D strided", add, a, b, device);
TIMEM("1D strided", mx::add, a, b, device);
}
void time_irregular_binary_ops_2D() {
auto device = default_device();
auto device = mx::default_device();
int size = 2048;
auto a = random::uniform({size, size});
auto b = random::uniform({size, size});
eval(a, b);
TIMEM("2D regular", add, a, b, device);
auto a = mx::random::uniform({size, size});
auto b = mx::random::uniform({size, size});
mx::eval(a, b);
TIMEM("2D regular", mx::add, a, b, device);
b = transpose(b);
eval(b);
TIMEM("2D transpose", add, a, b, device);
b = mx::transpose(b);
mx::eval(b);
TIMEM("2D mx::transpose", mx::add, a, b, device);
b = random::uniform({size});
eval(b);
TIMEM("2D broadcast dim 0", add, a, b, device);
b = mx::random::uniform({size});
mx::eval(b);
TIMEM("2D broadcast dim 0", mx::add, a, b, device);
b = reshape(b, {size, 1});
eval(b);
TIMEM("2D broadcast dim 1", add, a, b, device);
b = mx::reshape(b, {size, 1});
mx::eval(b);
TIMEM("2D broadcast dim 1", mx::add, a, b, device);
}
void time_irregular_binary_ops_3D() {
auto device = default_device();
auto device = mx::default_device();
int d0 = 32;
int d1 = 512;
int d2 = 512;
auto a = random::uniform({d0, d1, d2});
auto b = random::uniform({d0, d1, d2});
TIMEM("3D regular", add, a, b, device);
auto a = mx::random::uniform({d0, d1, d2});
auto b = mx::random::uniform({d0, d1, d2});
TIMEM("3D regular", mx::add, a, b, device);
b = transpose(b, {0, 2, 1});
TIMEM("3D transpose", add, a, b, device);
b = mx::transpose(b, {0, 2, 1});
TIMEM("3D mx::transpose", mx::add, a, b, device);
b = random::uniform({d1, d2});
TIMEM("3D broadcast dim 0", add, a, b, device);
b = mx::random::uniform({d1, d2});
TIMEM("3D broadcast dim 0", mx::add, a, b, device);
b = random::uniform({d0, 1, d2});
TIMEM("3D broadcast dim 1", add, a, b, device);
b = mx::random::uniform({d0, 1, d2});
TIMEM("3D broadcast dim 1", mx::add, a, b, device);
b = random::uniform({d0, d1, 1});
TIMEM("3D broadcast dim 2", add, a, b, device);
b = mx::random::uniform({d0, d1, 1});
TIMEM("3D broadcast dim 2", mx::add, a, b, device);
b = random::uniform({d2});
TIMEM("3D broadcast dims 0, 1", add, a, b, device);
b = mx::random::uniform({d2});
TIMEM("3D broadcast dims 0, 1", mx::add, a, b, device);
b = random::uniform({d1, 1});
TIMEM("3D broadcast dims 0, 2", add, a, b, device);
b = mx::random::uniform({d1, 1});
TIMEM("3D broadcast dims 0, 2", mx::add, a, b, device);
b = random::uniform({d0, 1, 1});
TIMEM("3D broadcast dims 1, 2", add, a, b, device);
b = mx::random::uniform({d0, 1, 1});
TIMEM("3D broadcast dims 1, 2", mx::add, a, b, device);
}
void time_irregular_binary_ops_4D() {
auto device = default_device();
auto device = mx::default_device();
std::vector<int> shape = {8, 8, 512, 512};
auto a = random::uniform(shape);
auto b = random::uniform(shape);
auto a = mx::random::uniform(shape);
auto b = mx::random::uniform(shape);
TIMEM("4D regular", add, a, b, device);
TIMEM("4D regular", mx::add, a, b, device);
b = transpose(b, {0, 1, 3, 2});
TIMEM("4D transpose", add, a, b, device);
b = mx::transpose(b, {0, 1, 3, 2});
TIMEM("4D mx::transpose", mx::add, a, b, device);
std::string om = "4D broadcast dims ";
for (int i = 0; i < shape.size(); ++i) {
shape[i] = 1;
b = random::uniform(shape);
b = mx::random::uniform(shape);
std::ostringstream msg;
msg << om << i;
TIMEM(msg.str(), add, a, b, device);
TIMEM(msg.str(), mx::add, a, b, device);
for (int j = i + 1; j < shape.size(); ++j) {
shape[j] = 1;
std::ostringstream msg;
msg << om << i << ", " << j;
b = random::uniform(shape);
TIMEM(msg.str(), add, a, b, device);
b = mx::random::uniform(shape);
TIMEM(msg.str(), mx::add, a, b, device);
shape[j] = a.shape(j);
for (int k = j + 1; k < shape.size(); ++k) {
shape[k] = 1;
std::ostringstream msg;
msg << om << i << ", " << j << ", " << k;
b = random::uniform(shape);
TIMEM(msg.str(), add, a, b, device);
b = mx::random::uniform(shape);
TIMEM(msg.str(), mx::add, a, b, device);
shape[k] = a.shape(k);
}
}
@@ -113,83 +113,83 @@ void time_irregular_binary_ops_4D() {
}
void time_irregular_reshape() {
auto device = default_device();
auto device = mx::default_device();
std::vector<int> shape;
auto reshape_fn = [&shape, device](const array& a) {
return reshape(a, shape, device);
auto reshape_fn = [&shape, device](const mx::array& a) {
return mx::reshape(a, shape, device);
};
int size = 64;
int d = 2 * size;
auto a = random::uniform({d, d, d});
auto a = mx::random::uniform({d, d, d});
shape = {8 * size, size, size};
TIMEM("3D contiguous", reshape_fn, a);
a = transpose(a);
a = mx::transpose(a);
shape = {8 * size, size, size};
TIMEM("3D transpose", reshape_fn, a);
TIMEM("3D mx::transpose", reshape_fn, a);
a = transpose(a, {1, 2, 0});
a = mx::transpose(a, {1, 2, 0});
shape = {8 * size, size, size};
TIMEM("3D transpose dims 1 2", reshape_fn, a);
TIMEM("3D mx::transpose dims 1 2", reshape_fn, a);
a = broadcast_to(random::uniform({d, d}), {d, d, d});
a = mx::broadcast_to(mx::random::uniform({d, d}), {d, d, d});
TIMEM("3D broadcast dim 0", reshape_fn, a);
a = broadcast_to(random::uniform({d, 1, d}), {d, d, d});
a = mx::broadcast_to(mx::random::uniform({d, 1, d}), {d, d, d});
TIMEM("3D broadcast dim 1", reshape_fn, a);
a = broadcast_to(random::uniform({d, d, 1}), {d, d, d});
a = mx::broadcast_to(mx::random::uniform({d, d, 1}), {d, d, d});
TIMEM("3D broadcast dim 2", reshape_fn, a);
a = broadcast_to(random::uniform({d}), {d, d, d});
a = mx::broadcast_to(mx::random::uniform({d}), {d, d, d});
TIMEM("3D broadcast dims 0, 1", reshape_fn, a);
a = broadcast_to(random::uniform({d, 1}), {d, d, d});
a = mx::broadcast_to(mx::random::uniform({d, 1}), {d, d, d});
TIMEM("3D broadcast dims 0, 2", reshape_fn, a);
a = broadcast_to(random::uniform({d, 1, 1}), {d, d, d});
a = mx::broadcast_to(mx::random::uniform({d, 1, 1}), {d, d, d});
TIMEM("3D broadcast dims 1, 2", reshape_fn, a);
a = broadcast_to(random::uniform({1, 1, 1}), {d, d, d});
a = mx::broadcast_to(mx::random::uniform({1, 1, 1}), {d, d, d});
TIMEM("3D broadcast dims 1, 2, 3", reshape_fn, a);
}
void time_irregular_astype_1D() {
auto device = default_device();
auto device = mx::default_device();
int size = 1000000;
int step = 2;
auto a = random::uniform({size});
auto a = mx::random::uniform({size});
a = slice(a, {0}, {size}, {step});
TIMEM("1D strided", astype, a, int32, device);
TIMEM("1D strided", mx::astype, a, mx::int32, device);
}
void time_irregular_astype_2D() {
auto device = default_device();
auto device = mx::default_device();
int size = 2048;
std::vector<int> shape = {size, size};
auto a = random::uniform(shape);
TIMEM("2D regular", astype, a, int32, device);
auto a = mx::random::uniform(shape);
TIMEM("2D regular", mx::astype, a, mx::int32, device);
a = transpose(a);
TIMEM("2D transpose", astype, a, int32, device);
a = mx::transpose(a);
TIMEM("2D mx::transpose", mx::astype, a, mx::int32, device);
a = broadcast_to(random::uniform({size}), shape);
TIMEM("2D broadcast dim 0", astype, a, int32, device);
a = mx::broadcast_to(mx::random::uniform({size}), shape);
TIMEM("2D broadcast dim 0", mx::astype, a, mx::int32, device);
a = broadcast_to(random::uniform({size, 1}), shape);
TIMEM("2D broadcast dim 1", astype, a, int32, device);
a = mx::broadcast_to(mx::random::uniform({size, 1}), shape);
TIMEM("2D broadcast dim 1", mx::astype, a, mx::int32, device);
}
int main(int argc, char** argv) {
if (argc > 1) {
bool use_gpu = !strcmp(argv[1], "gpu");
set_default_device(use_gpu ? Device::gpu : Device::cpu);
set_default_device(use_gpu ? mx::Device::gpu : mx::Device::cpu);
}
std::cout << "Benchmarks for " << default_device() << std::endl;
std::cout << "Benchmarks for " << mx::default_device() << std::endl;
time_irregular_binary_ops_1D();
time_irregular_binary_ops_2D();
time_irregular_binary_ops_3D();

View File

@@ -3,20 +3,20 @@
#include "mlx/mlx.h"
#include "time_utils.h"
using namespace mlx::core;
namespace mx = mlx::core;
void time_creation_ops() {
int M = 2000;
int N = 500;
auto shape = {M, N};
auto full_fp32 = [&]() { return full(shape, 3.3f); };
auto full_fp32 = [&]() { return mx::full(shape, 3.3f); };
TIME(full_fp32);
auto zeros_fp32 = [&]() { return zeros(shape, float32); };
auto zeros_fp32 = [&]() { return mx::zeros(shape, mx::float32); };
TIME(zeros_fp32);
auto ones_fp32 = [&]() { return ones(shape, float32); };
auto ones_fp32 = [&]() { return mx::ones(shape, mx::float32); };
TIME(ones_fp32);
auto arange_fp32 = [&]() { return arange(0.0, 10.0, 1e-4); };
auto arange_fp32 = [&]() { return mx::arange(0.0, 10.0, 1e-4); };
TIME(arange_fp32);
}
@@ -24,188 +24,196 @@ void time_type_conversions() {
int M = 2000;
int N = 500;
auto shape = {M, N};
auto device = default_device();
auto device = mx::default_device();
auto a = zeros(shape, float32);
eval(a);
TIMEM("float32 to int32", astype, a, int32, device);
TIMEM("float32 to uint32", astype, a, uint32, device);
auto a = mx::zeros(shape, mx::float32);
mx::eval(a);
TIMEM("mx::float32 to mx::int32", mx::astype, a, mx::int32, device);
TIMEM("mx::float32 to mx::uint32", mx::astype, a, mx::uint32, device);
a = zeros(shape, int32);
eval(a);
TIMEM("int32 to float32", astype, a, float32, device);
a = mx::zeros(shape, mx::int32);
mx::eval(a);
TIMEM("mx::int32 to mx::float32", mx::astype, a, mx::float32, device);
a = zeros(shape, bool_);
eval(a);
TIMEM("bool to float32", astype, a, float32, device);
TIMEM("bool to int32", astype, a, int32, device);
TIMEM("bool to uint32", astype, a, uint32, device);
a = mx::zeros(shape, mx::bool_);
mx::eval(a);
TIMEM("bool to mx::float32", mx::astype, a, mx::float32, device);
TIMEM("bool to mx::int32", mx::astype, a, mx::int32, device);
TIMEM("bool to mx::uint32", mx::astype, a, mx::uint32, device);
}
void time_random_generation() {
int M = 2000;
int N = 500;
auto uniform = [&]() { return random::uniform({M, N}, float32); };
auto uniform = [&]() { return mx::random::uniform({M, N}, mx::float32); };
TIME(uniform);
auto normal = [&]() { return random::normal({M, N}, float32); };
auto normal = [&]() { return mx::random::normal({M, N}, mx::float32); };
TIME(normal);
}
void time_unary_ops() {
int M = 2000;
int N = 500;
auto device = default_device();
auto device = mx::default_device();
auto a = random::normal({M, N});
eval(a);
auto a = mx::random::normal({M, N});
mx::eval(a);
TIME(mlx::core::abs, a, device);
TIME(negative, a, device);
TIME(sign, a, device);
TIME(square, a, device);
TIME(mx::negative, a, device);
TIME(mx::sign, a, device);
TIME(mx::square, a, device);
TIME(mlx::core::sqrt, a, device);
TIME(rsqrt, a, device);
TIME(mx::rsqrt, a, device);
TIME(mlx::core::exp, a, device);
a = random::uniform({M, N});
a = mx::random::uniform({M, N});
TIME(mlx::core::log, a, device);
}
void time_binary_ops() {
int M = 1000, N = 100, K = 10;
auto a = random::uniform({M, N, K});
auto b = random::uniform({M, N, K});
auto device = default_device();
eval(a, b);
auto condition = mx::random::randint(0, 2, {M, N, K});
auto a = mx::random::uniform({M, N, K});
auto b = mx::random::uniform({M, N, K});
auto device = mx::default_device();
mx::eval(a, b);
TIME(add, a, b, device);
TIME(subtract, a, b, device);
TIME(multiply, a, b, device);
TIME(divide, a, b, device);
TIME(maximum, a, b, device);
TIME(minimum, a, b, device);
TIME(mx::add, a, b, device);
TIME(mx::subtract, a, b, device);
TIME(mx::multiply, a, b, device);
TIME(mx::divide, a, b, device);
TIME(mx::maximum, a, b, device);
TIME(mx::minimum, a, b, device);
TIME(mx::where, condition, a, b, device);
b = random::uniform({1});
eval(b);
TIMEM("scalar", add, a, b, device);
TIMEM("vector-scalar", subtract, a, b, device);
TIMEM("scalar-vector", subtract, b, a, device);
TIMEM("scalar", multiply, a, b, device);
TIMEM("vector-scalar", divide, a, b, device);
TIMEM("scalar-vector", divide, b, a, device);
condition = mx::array({true});
b = mx::random::uniform({1});
mx::eval(b);
TIMEM("scalar", mx::add, a, b, device);
TIMEM("vector-scalar", mx::subtract, a, b, device);
TIMEM("scalar-vector", mx::subtract, b, a, device);
TIMEM("scalar", mx::multiply, a, b, device);
TIMEM("vector-scalar", mx::divide, a, b, device);
TIMEM("scalar-vector", mx::divide, b, a, device);
TIMEM("scalar-vector", mx::where, condition, a, b, device);
a = broadcast_to(random::uniform({1}), {1000, 100});
b = broadcast_to(random::uniform({1}), {1000, 100});
eval(a, b);
TIMEM("scalar-scalar broadcast", add, a, b, device);
TIMEM("scalar-scalar broadcast", subtract, a, b, device);
TIMEM("scalar-scalar broadcast", multiply, a, b, device);
TIMEM("scalar-scalar broadcast", divide, a, b, device);
condition = mx::broadcast_to(mx::array({true}), {1000, 100});
a = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
b = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
mx::eval(a, b);
TIMEM("scalar-scalar broadcast", mx::add, a, b, device);
TIMEM("scalar-scalar broadcast", mx::subtract, a, b, device);
TIMEM("scalar-scalar broadcast", mx::multiply, a, b, device);
TIMEM("scalar-scalar broadcast", mx::divide, a, b, device);
TIMEM("scalar-scalar broadcast", mx::where, condition, a, b, device);
}
void time_strided_ops() {
int M = 50, N = 50, O = 50, P = 50;
auto a = random::uniform({M, N, O, P});
auto b = random::uniform({M, N, O, P});
auto device = default_device();
eval(a, b);
TIMEM("non-strided", add, a, b, device);
a = transpose(a, {1, 0, 2, 3});
b = transpose(b, {3, 2, 0, 1});
eval(a, b);
TIMEM("strided", add, a, b, device);
auto a = mx::random::uniform({M, N, O, P});
auto b = mx::random::uniform({M, N, O, P});
auto device = mx::default_device();
mx::eval(a, b);
TIMEM("non-strided", mx::add, a, b, device);
a = mx::transpose(a, {1, 0, 2, 3});
b = mx::transpose(b, {3, 2, 0, 1});
mx::eval(a, b);
TIMEM("strided", mx::add, a, b, device);
}
void time_comparisons() {
int M = 1000, N = 100, K = 10;
auto a = random::uniform({M, N, K});
auto b = random::uniform({M, N, K});
auto device = default_device();
eval(a, b);
TIME(equal, a, b, device);
TIME(greater, a, b, device);
TIME(greater_equal, a, b, device);
TIME(less, a, b, device);
TIME(less_equal, a, b, device);
auto a = mx::random::uniform({M, N, K});
auto b = mx::random::uniform({M, N, K});
auto device = mx::default_device();
mx::eval(a, b);
TIME(mx::equal, a, b, device);
TIME(mx::greater, a, b, device);
TIME(mx::greater_equal, a, b, device);
TIME(mx::less, a, b, device);
TIME(mx::less_equal, a, b, device);
}
void time_matvec() {
int M = 2000, N = 200;
auto a = random::uniform({M, N});
auto b = random::uniform({N});
auto c = random::uniform({M});
eval(a, b, c);
auto matvec = [&]() { return matmul(a, b); };
auto a = mx::random::uniform({M, N});
auto b = mx::random::uniform({N});
auto c = mx::random::uniform({M});
mx::eval(a, b, c);
auto matvec = [&]() { return mx::matmul(a, b); };
TIME(matvec);
auto matvec_transpose = [&]() { return matmul(transpose(a), c); };
auto matvec_transpose = [&]() { return mx::matmul(mx::transpose(a), c); };
TIME(matvec_transpose);
}
void time_matmul() {
int M = 1000, N = 1000, K = 1000;
auto a = random::uniform({M, K});
auto b = random::uniform({K, N});
auto device = default_device();
eval(a, b);
TIME(matmul, a, b, device);
auto a = mx::random::uniform({M, K});
auto b = mx::random::uniform({K, N});
auto device = mx::default_device();
mx::eval(a, b);
TIME(mx::matmul, a, b, device);
auto transpose_matmul = [&]() { return matmul(transpose(a), b); };
auto transpose_matmul = [&]() { return mx::matmul(mx::transpose(a), b); };
TIME(transpose_matmul);
}
void time_reductions() {
auto a = random::normal({10000, 1000});
eval(a);
auto sum_all = [&a]() { return sum(a, false); };
auto a = mx::random::normal({10000, 1000});
mx::eval(a);
auto sum_all = [&a]() { return mx::sum(a, false); };
TIME(sum_all);
auto sum_along_0 = [&a]() { return sum(a, 0, false); };
auto sum_along_0 = [&a]() { return mx::sum(a, 0, false); };
TIME(sum_along_0);
auto sum_along_1 = [&a]() { return sum(a, 1, false); };
auto sum_along_1 = [&a]() { return mx::sum(a, 1, false); };
TIME(sum_along_1);
auto prod_all = [&a]() { return prod(a, false); };
auto prod_all = [&a]() { return mx::prod(a, false); };
TIME(prod_all);
auto all_true = [&a]() { return all(a, false); };
auto all_true = [&a]() { return mx::all(a, false); };
TIME(all_true);
auto all_along_0 = [&a]() { return all(a, 0, false); };
auto all_along_0 = [&a]() { return mx::all(a, 0, false); };
TIME(all_along_0);
auto all_along_1 = [&a]() { return all(a, 1, false); };
auto all_along_1 = [&a]() { return mx::all(a, 1, false); };
TIME(all_along_1);
auto any_true = [&a]() { return any(a, false); };
auto any_true = [&a]() { return mx::any(a, false); };
TIME(any_true);
auto argmin_along_0 = [&a]() { return argmin(a, 0, false); };
auto argmin_along_0 = [&a]() { return mx::argmin(a, 0, false); };
TIME(argmin_along_0);
auto argmin_along_1 = [&a]() { return argmin(a, 1, false); };
auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
TIME(argmin_along_1);
}
void time_gather_scatter() {
auto a = random::normal({1000, 768});
eval(a);
auto indices = random::randint(0, 1000, {256});
eval(indices);
auto a = mx::random::normal({1000, 768});
mx::eval(a);
auto indices = mx::random::randint(0, 1000, {256});
mx::eval(indices);
auto embedding_lookup = [&a, &indices]() { return take(a, indices, 0); };
auto embedding_lookup = [&a, &indices]() { return mx::take(a, indices, 0); };
TIME(embedding_lookup);
indices = random::randint(0, 768 * 1000, {256 * 768});
eval(indices);
indices = mx::random::randint(0, 768 * 1000, {256 * 768});
mx::eval(indices);
auto single_element_lookup = [&a, &indices]() { return take(a, indices); };
auto single_element_lookup = [&a, &indices]() {
return mx::take(a, indices);
};
TIME(single_element_lookup);
indices = random::randint(0, 1000, {256});
auto updates = random::normal({256, 1, 768});
eval(indices, updates);
indices = mx::random::randint(0, 1000, {256});
auto updates = mx::random::normal({256, 1, 768});
mx::eval(indices, updates);
auto embedding_update = [&a, &indices, &updates]() {
return scatter(a, indices, updates, 0);
@@ -217,10 +225,10 @@ void time_gather_scatter() {
};
TIME(embedding_add);
a = reshape(a, {-1});
indices = random::randint(0, 768 * 1000, {768 * 256});
updates = random::normal({256 * 768, 1});
eval(a, indices, updates);
a = mx::reshape(a, {-1});
indices = mx::random::randint(0, 768 * 1000, {768 * 256});
updates = mx::random::normal({256 * 768, 1});
mx::eval(a, indices, updates);
auto single_element_update = [&a, &indices, &updates]() {
return scatter(a, indices, updates, 0);
@@ -233,8 +241,22 @@ void time_gather_scatter() {
TIME(single_element_add);
}
void time_divmod() {
auto a = mx::random::normal({1000});
auto b = mx::random::normal({1000});
mx::eval({a, b});
auto divmod_fused = [&a, &b]() { return mx::divmod(a, b); };
TIME(divmod_fused);
auto divmod_separate = [&a, &b]() {
return std::vector<mx::array>{mx::floor_divide(a, b), mx::remainder(a, b)};
};
TIME(divmod_separate);
}
int main() {
std::cout << "Benchmarks for " << default_device() << std::endl;
std::cout << "Benchmarks for " << mx::default_device() << std::endl;
time_creation_ops();
time_type_conversions();
time_unary_ops();
@@ -246,4 +268,5 @@ int main() {
time_matmul();
time_reductions();
time_gather_scatter();
time_divmod();
}

View File

@@ -17,14 +17,13 @@
<< std::setprecision(5) << time_fn(FUNC, ##__VA_ARGS__) << " msec" \
<< std::endl;
#define TIMEM(MSG, FUNC, ...) \
std::cout << "Timing " \
<< "(" << MSG << ") " << #FUNC << " ... " << std::flush \
<< std::setprecision(5) << time_fn(FUNC, ##__VA_ARGS__) << " msec" \
<< std::endl;
#define TIMEM(MSG, FUNC, ...) \
std::cout << "Timing " << "(" << MSG << ") " << #FUNC << " ... " \
<< std::flush << std::setprecision(5) \
<< time_fn(FUNC, ##__VA_ARGS__) << " msec" << std::endl;
template <typename F, typename... Args>
double time_fn(F fn, Args... args) {
double time_fn(F fn, Args&&... args) {
// warmup
for (int i = 0; i < 5; ++i) {
eval(fn(std::forward<Args>(args)...));

View File

@@ -1,7 +1,6 @@
# Copyright © 2023 Apple Inc.
import numpy as np
from time_utils import time_fn

View File

@@ -1,8 +1,8 @@
# Copyright © 2023 Apple Inc.
import argparse
import mlx.core as mx
import mlx.core as mx
from time_utils import time_fn
B = 8

View File

@@ -1,13 +1,14 @@
# Copyright © 2023 Apple Inc.
import numpy as np
import argparse
import mlx.core as mx
import time
import torch
import os
import math
import os
import subprocess
import time
import mlx.core as mx
import numpy as np
import torch
device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
device_name = device_name.decode("utf-8").strip("\n")
@@ -165,13 +166,13 @@ if __name__ == "__main__":
dtypes = ("float32", "float16")
transposes = ("nn", "nt", "tn")
shapes = (
(16, 234, 768, 3072),
(1, 64, 64, 25344),
(16, 1024, 1024, 1024),
(1, 1024, 1024, 2048),
(4, 1024, 1024, 4096),
(4, 1024, 4096, 1024),
(1, 4096, 4096, 4096),
(15, 1023, 1023, 1023),
(17, 1025, 1025, 1025),
)
for dtype in dtypes:

View File

@@ -1,14 +1,14 @@
# Copyright © 2023 Apple Inc.
import matplotlib.pyplot as plt
import numpy as np
import argparse
import mlx.core as mx
import time
import torch
import os
import subprocess
import time
import matplotlib.pyplot as plt
import mlx.core as mx
import numpy as np
import torch
results_dir = "./results"
@@ -133,7 +133,7 @@ def get_gbyte_size(in_vec_len, out_vec_len, np_dtype):
return float(N_iter_bench * N_iter_func * n_elem * item_size) / float(1024**3)
def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, tranpose):
def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, transpose):
np_dtype = getattr(np, dtype)
mlx_gb_s = []
mlx_gflops = []
@@ -164,7 +164,7 @@ def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, tranpose):
ax.legend()
def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, tranpose):
def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose):
np_dtype = getattr(np, dtype)
mlx_gb_s = []
mlx_gflops = []

View File

@@ -4,8 +4,10 @@ import argparse
import math
import os
import time
from functools import partial
import mlx.core as mx
import mlx.nn as nn
def int_or_list(x):
@@ -22,6 +24,16 @@ def none_or_list(x):
return [int(xi) for xi in x.split(",")]
def dtype_from_str(x):
if x == "":
return mx.float32
else:
dt = getattr(mx, x)
if not isinstance(dt, mx.Dtype):
raise ValueError(f"{x} is not an mlx dtype")
return dt
def bench(f, *args):
for i in range(10):
f(*args)
@@ -48,6 +60,63 @@ def matmul(x, y):
mx.eval(ys)
def _quant_matmul(x, w, s, b, transpose, group_size, bits):
ys = []
for i in range(10):
ys.append(
mx.quantized_matmul(
x, w, s, b, transpose=transpose, group_size=group_size, bits=bits
)
)
mx.eval(ys)
quant_matmul = {
"quant_matmul_32_2": partial(_quant_matmul, transpose=False, group_size=32, bits=2),
"quant_matmul_32_4": partial(_quant_matmul, transpose=False, group_size=32, bits=4),
"quant_matmul_32_8": partial(_quant_matmul, transpose=False, group_size=32, bits=8),
"quant_matmul_64_2": partial(_quant_matmul, transpose=False, group_size=64, bits=2),
"quant_matmul_64_4": partial(_quant_matmul, transpose=False, group_size=64, bits=4),
"quant_matmul_64_8": partial(_quant_matmul, transpose=False, group_size=64, bits=8),
"quant_matmul_128_2": partial(
_quant_matmul, transpose=False, group_size=128, bits=2
),
"quant_matmul_128_4": partial(
_quant_matmul, transpose=False, group_size=128, bits=4
),
"quant_matmul_128_8": partial(
_quant_matmul, transpose=False, group_size=128, bits=8
),
"quant_matmul_t_32_2": partial(
_quant_matmul, transpose=True, group_size=32, bits=2
),
"quant_matmul_t_32_4": partial(
_quant_matmul, transpose=True, group_size=32, bits=4
),
"quant_matmul_t_32_8": partial(
_quant_matmul, transpose=True, group_size=32, bits=8
),
"quant_matmul_t_64_2": partial(
_quant_matmul, transpose=True, group_size=64, bits=2
),
"quant_matmul_t_64_4": partial(
_quant_matmul, transpose=True, group_size=64, bits=4
),
"quant_matmul_t_64_8": partial(
_quant_matmul, transpose=True, group_size=64, bits=8
),
"quant_matmul_t_128_2": partial(
_quant_matmul, transpose=True, group_size=128, bits=2
),
"quant_matmul_t_128_4": partial(
_quant_matmul, transpose=True, group_size=128, bits=4
),
"quant_matmul_t_128_8": partial(
_quant_matmul, transpose=True, group_size=128, bits=8
),
}
def conv1d(x, y):
ys = []
for i in range(10):
@@ -75,6 +144,13 @@ def reduction(op, axis, x):
mx.eval(ys)
def sum_and_add(axis, x, y):
z = x.sum(axis=axis, keepdims=True)
for i in range(50):
z = (z + y).sum(axis=axis, keepdims=True)
mx.eval(z)
def softmax(axis, x):
ys = []
for i in range(100):
@@ -95,7 +171,77 @@ def softmax_fused(axis, x):
def relu(x):
y = x
for i in range(100):
y = mx.maximum(y, 0)
y = nn.relu(y)
mx.eval(y)
def leaky_relu(x: mx.array):
y = x
for i in range(100):
y = nn.leaky_relu(y)
mx.eval(y)
def prelu(x: mx.array):
y = x
for i in range(100):
y = nn.prelu(y, mx.ones(1))
mx.eval(y)
def softplus(x: mx.array):
y = x
for i in range(100):
y = nn.softplus(y)
mx.eval(y)
def mish(x: mx.array):
y = x
for i in range(100):
y = nn.mish(y)
mx.eval(y)
def leaky_relu(x):
y = x
for i in range(100):
y = nn.leaky_relu(y)
mx.eval(y)
def elu(x):
y = x
for i in range(100):
y = nn.elu(y)
mx.eval(y)
def relu6(x):
y = x
for i in range(100):
y = nn.relu6(y)
mx.eval(y)
def softplus(x):
y = x
for i in range(100):
y = nn.softplus(y)
mx.eval(y)
def celu(x):
y = x
for i in range(100):
y = nn.celu(y)
mx.eval(y)
def log_sigmoid(x):
y = x
for i in range(100):
y = nn.log_sigmoid(y)
mx.eval(y)
@@ -130,6 +276,13 @@ def linear(w, b, x):
mx.eval(ys)
def linear_fused(w, b, x):
ys = []
for i in range(10):
ys.append(mx.addmm(b, x, mx.transpose(w, (1, 0))))
mx.eval(ys)
def rope(x):
*_, N, D = x.shape
ys = []
@@ -180,6 +333,20 @@ def topk(axis, x):
mx.eval(ys)
def step_function(x):
y = x
for i in range(100):
y = nn.step(x)
mx.eval(y)
def selu(x):
y = x
for i in range(100):
y = nn.selu(x)
mx.eval(y)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("benchmark", help="Choose the benchmark to run")
@@ -211,9 +378,7 @@ if __name__ == "__main__":
parser.add_argument(
"--fused", action="store_true", help="Use fused functions where possible"
)
parser.add_argument(
"--dtype", choices=["float32", "float16", "bfloat16"], default="float32"
)
parser.add_argument("--dtype", type=dtype_from_str, default=[], action="append")
args = parser.parse_args()
@@ -222,19 +387,19 @@ if __name__ == "__main__":
if len(args.axis) > 1:
args.axis.pop(0)
if args.print_pid:
print(os.getpid())
input("Press enter to run")
if args.cpu:
mx.set_default_device(mx.cpu)
else:
mx.set_default_device(mx.gpu)
dtype = dict(float32=mx.float32, float16=mx.float16, bfloat16=mx.bfloat16)[
args.dtype
]
types = args.dtype
if not types:
types = [mx.float32]
if len(types) < len(args.size):
types = types + [types[0]] * (len(args.size) - len(types))
xs = []
for size in args.size:
for size, dtype in zip(args.size, types):
xs.append(mx.random.normal(size).astype(dtype))
for i, t in enumerate(args.transpose):
if t is None:
@@ -244,14 +409,24 @@ if __name__ == "__main__":
x = xs[0]
axis = args.axis[0]
if args.print_pid:
print(os.getpid())
input("Press enter to run")
if args.benchmark == "matmul_square":
print(bench(matmul_square, x))
elif args.benchmark == "matmul":
print(bench(matmul, *xs))
elif args.benchmark.startswith("quant_matmul"):
print(bench(quant_matmul[args.benchmark], *xs))
elif args.benchmark == "linear":
print(bench(linear, *xs))
if args.fused:
print(bench(linear_fused, *xs))
else:
print(bench(linear, *xs))
elif args.benchmark == "sum_axis":
print(bench(reduction, "sum", axis, x))
@@ -277,6 +452,26 @@ if __name__ == "__main__":
elif args.benchmark == "relu":
print(bench(relu, x))
elif args.benchmark == "elu":
print(bench(elu, x))
elif args.benchmark == "relu6":
print(bench(relu6, x))
elif args.benchmark == "celu":
print(bench(celu, x))
elif args.benchmark == "log_sigmoid":
print(bench(log_sigmoid, x))
elif args.benchmark == "leaky_relu":
print(bench(leaky_relu, x))
elif args.benchmark == "prelu":
print(bench(prelu, x))
elif args.benchmark == "softplus":
print(bench(softplus, x))
elif args.benchmark == "mish":
print(bench(mish, x))
elif args.benchmark == "scalar_mul":
print(bench(scalar_mult, x))
@@ -311,5 +506,14 @@ if __name__ == "__main__":
elif args.benchmark == "topk":
print(bench(topk, axis, x))
elif args.benchmark == "step":
print(bench(step_function, x))
elif args.benchmark == "selu":
print(bench(selu, x))
elif args.benchmark == "sum_and_add":
print(bench(sum_and_add, axis, *xs))
else:
raise ValueError("Unknown benchmark")
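For clarity, the following is a standalone sketch of the `--dtype` handling that the benchmark script diff above introduces: each `--dtype` flag appends one entry, an empty list falls back to a single default dtype, and the list is padded with its first element so it can be zipped against `--size`. The argument names mirror the diff; everything else here is an illustrative reconstruction, not code from the repository:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--size", default=[], action="append")
parser.add_argument("--dtype", default=[], action="append")

# Example invocation: two sizes, one dtype.
args = parser.parse_args(["--size", "1024x1024", "--size", "512x512", "--dtype", "float16"])

types = args.dtype
if not types:
    types = ["float32"]  # default when no --dtype is given
if len(types) < len(args.size):
    # Pad with the first dtype so every size gets one when zipped.
    types = types + [types[0]] * (len(args.size) - len(types))

for size, dtype in zip(args.size, types):
    print(size, dtype)  # 1024x1024 float16, then 512x512 float16
```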

View File

@@ -22,6 +22,16 @@ def none_or_list(x):
return [int(xi) for xi in x.split(",")]
def dtype_from_str(x):
if x == "":
return torch.float32
else:
dt = getattr(torch, x)
if not isinstance(dt, torch.dtype):
raise ValueError(f"{x} is not a torch dtype")
return dt
def bench(f, *args):
for i in range(10):
f(*args)
@@ -115,6 +125,70 @@ def relu(x):
sync_if_needed(x)
@torch.no_grad()
def leaky_relu(x):
y = x
for i in range(100):
y = torch.nn.functional.leaky_relu(y)
sync_if_needed(x)
@torch.no_grad()
def elu(x):
y = x
for i in range(100):
y = torch.nn.functional.elu(y)
sync_if_needed(x)
@torch.no_grad()
def celu(x):
y = x
for i in range(100):
y = torch.nn.functional.celu(y)
sync_if_needed(x)
@torch.no_grad()
def relu6(x):
y = x
for i in range(100):
y = torch.nn.functional.relu6(y)
sync_if_needed(x)
@torch.no_grad()
def softplus(x):
y = x
for i in range(100):
y = torch.nn.functional.softplus(y)
sync_if_needed(x)
@torch.no_grad()
def log_sigmoid(x):
y = x
for i in range(100):
y = torch.nn.functional.logsigmoid(y)
sync_if_needed(x)
@torch.no_grad()
def prelu(x: torch.Tensor) -> torch.Tensor:
y = x
for _ in range(100):
y = torch.nn.functional.prelu(y, torch.ones(1).to(y.device))
sync_if_needed(x)
@torch.no_grad()
def mish(x: torch.Tensor) -> torch.Tensor:
y = x
for _ in range(100):
y = torch.nn.functional.mish(y)
sync_if_needed(x)
@torch.no_grad()
def scalar_mult(x):
y = x
@@ -209,6 +283,22 @@ def topk(axis, x):
sync_if_needed(x)
@torch.no_grad()
def step_function(x):
y = x
for i in range(100):
y = torch.where(y < 0, 0, 1)
sync_if_needed(x)
@torch.no_grad()
def selu(x):
y = x
for i in range(100):
y = torch.nn.functional.selu(y)
sync_if_needed(x)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("benchmark", help="Choose the benchmark to run")
@@ -240,7 +330,7 @@ if __name__ == "__main__":
parser.add_argument(
"--fused", action="store_true", help="Use fused functions where possible"
)
parser.add_argument("--dtype", choices=["float32", "float16"], default="float32")
parser.add_argument("--dtype", type=dtype_from_str, default=[], action="append")
args = parser.parse_args()
@@ -249,15 +339,17 @@ if __name__ == "__main__":
if len(args.axis) > 1:
args.axis.pop(0)
if args.print_pid:
print(os.getpid())
input("Press enter to run")
torch.set_num_threads(1)
device = "cpu" if args.cpu else "mps"
dtype = dict(float32=torch.float32, float16=torch.float16)[args.dtype]
types = args.dtype
if not types:
types = [torch.float32]
if len(types) < len(args.size):
types = types + [types[0]] * (len(args.size) - len(types))
xs = []
for size in args.size:
for size, dtype in zip(args.size, types):
xs.append(torch.randn(*size).to(device).to(dtype))
for i, t in enumerate(args.transpose):
if t is None:
@@ -266,6 +358,10 @@ if __name__ == "__main__":
x = xs[0]
axis = args.axis[0]
if args.print_pid:
print(os.getpid())
input("Press enter to run")
if args.benchmark == "matmul_square":
print(bench(matmul_square, x))
@@ -302,6 +398,28 @@ if __name__ == "__main__":
elif args.benchmark == "relu":
print(bench(relu, x))
elif args.benchmark == "leaky_relu":
print(bench(leaky_relu, x))
elif args.benchmark == "elu":
print(bench(elu, x))
elif args.benchmark == "relu6":
print(bench(relu6, x))
elif args.benchmark == "softplus":
print(bench(softplus, x))
elif args.benchmark == "celu":
print(bench(celu, x))
elif args.benchmark == "log_sigmoid":
print(bench(log_sigmoid, x))
elif args.benchmark == "prelu":
print(bench(prelu, x))
elif args.benchmark == "mish":
print(bench(mish, x))
elif args.benchmark == "scalar_mul":
print(bench(scalar_mult, x))
@@ -336,5 +454,11 @@ if __name__ == "__main__":
elif args.benchmark == "topk":
print(bench(topk, axis, x))
elif args.benchmark == "step":
print(bench(step_function, x))
elif args.benchmark == "selu":
print(bench(selu, x))
else:
raise ValueError("Unknown benchmark")
raise ValueError(f"Unknown benchmark `{args.benchmark}`.")

View File

@@ -16,7 +16,9 @@ def run_or_raise(*args, **kwargs):
result = run(*args, capture_output=True, **kwargs)
return float(result.stdout)
except ValueError:
raise ValueError(f"stdout: {result.stdout}\nstderr: {result.stderr}")
raise ValueError(
f"stdout: {result.stdout.decode()}\nstderr: {result.stderr.decode()}"
)
def compare(args):
@@ -62,7 +64,7 @@ def make_predicate(positive_filter, negative_filter):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run comparisons agains PyTorch")
parser = argparse.ArgumentParser(description="Run comparisons against PyTorch")
parser.add_argument(
"--filter", "-f", help="Regex filter to select benchmarks", nargs="+"
)
@@ -80,10 +82,8 @@ if __name__ == "__main__":
_filter = make_predicate(args.filter, args.negative_filter)
if args.mlx_dtypes:
compare_filtered = (
lambda x: compare_mlx_dtypes(
x.split() + rest, args.mlx_dtypes[0], args.mlx_dtypes[1]
)
compare_filtered = lambda x: (
compare_mlx_dtypes(x.split() + rest, args.mlx_dtypes[0], args.mlx_dtypes[1])
if _filter(x)
else None
)
@@ -125,6 +125,14 @@ if __name__ == "__main__":
compare_filtered("sum_axis --size 16x128x1024 --axis 1")
compare_filtered("sum_axis --size 16x128x1024 --axis 0 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,1 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,1")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,2 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,2")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,1 --transpose 0,2,1 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,1 --transpose 0,2,1")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,2 --transpose 0,2,1 --cpu")
compare_filtered("sum_axis --size 16x128x1024 --axis 0,2 --transpose 0,2,1")
compare_filtered("argmax --size 10x1024x128 --axis 1 --cpu")
compare_filtered("argmax --size 10x1024x128 --axis 1")
compare_filtered("argmax --size 10x1024x128 --axis 2 --cpu")
@@ -193,6 +201,27 @@ if __name__ == "__main__":
compare_filtered("softmax --size 2x1024x1024 --axis 1 --fused --cpu")
compare_filtered("relu --size 32x16x1024")
compare_filtered("relu --size 32x16x1024 --cpu")
compare_filtered("leaky_relu --size 32x16x1024")
compare_filtered("leaky_relu --size 32x16x1024 --cpu")
compare_filtered("elu --size 32x16x1024")
compare_filtered("elu --size 32x16x1024 --cpu")
compare_filtered("relu6 --size 32x16x1024")
compare_filtered("relu6 --size 32x16x1024 --cpu")
compare_filtered("softplus --size 32x16x1024")
compare_filtered("softplus --size 32x16x1024 --cpu")
compare_filtered("celu --size 32x16x1024")
compare_filtered("celu --size 32x16x1024 --cpu")
compare_filtered("log_sigmoid --size 32x16x1024")
compare_filtered("log_sigmoid --size 32x16x1024 --cpu")
compare_filtered("step --size 32x16x1024")
compare_filtered("step --size 32x16x1024 --cpu")
compare_filtered("selu --size 32x16x1024")
compare_filtered("selu --size 32x16x1024 --cpu")
# compare_filtered("mish --size 32x16x1024") NOTE: Torch does not implement Mish in MPS atm
compare_filtered("mish --size 32x16x1024 --cpu")
compare_filtered("prelu --size 32x16x1024")
compare_filtered("prelu --size 32x16x1024 --cpu")
compare_filtered("scalar_mul --size 32x16x1024")
compare_filtered("scalar_mul --size 32x16x1024 --cpu")
compare_filtered("cross_entropy --size 256x1024")


@@ -0,0 +1,107 @@
# Copyright © 2023-2024 Apple Inc.
import argparse
import math
import random
import mlx.core as mx
from time_utils import time_fn
def bench_gelu():
def gelu(x):
return x * (1 + mx.erf(x / math.sqrt(2))) / 2
x = mx.random.uniform(shape=(1000, 1024))
def gen_fun(fun):
def bench_fun(x):
for _ in range(10):
x = fun(x)
return x
return bench_fun
time_fn(gen_fun(gelu), x, msg="fixed gelu")
time_fn(gen_fun(mx.compile(gelu)), x, msg="compiled fixed gelu")
def randint():
return random.randint(1, x.shape[0])
def gen_fun(fun):
def bench_fun(x, y):
x = x[: randint()]
for _ in range(10):
x = fun(x)
y = fun(y)
return x, y
return bench_fun
y = mx.random.uniform(shape=(1000, 1024))
time_fn(gen_fun(gelu), x, y, msg="variable gelu")
time_fn(gen_fun(mx.compile(gelu)), x, y, msg="compiled variable gelu")
time_fn(
gen_fun(mx.compile(gelu, shapeless=True)),
x,
y,
msg="shapeless variable gelu",
)
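# Note: with shapeless=True the compiled graph is not specialized on the input
# shape, so the randomly sliced inputs above do not trigger recompilation on
# every call; that is what the "shapeless variable gelu" timing measures.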
def bench_layernorm():
weight = mx.random.uniform(shape=(4096,)).astype(mx.float16)
bias = mx.random.uniform(shape=(4096,)).astype(mx.float16)
mx.eval(weight, bias)
def layernorm(x):
x = x.astype(mx.float32)
means = mx.mean(x, axis=-1, keepdims=True)
var = mx.var(x, axis=-1, keepdims=True)
x = (x - means) * mx.rsqrt(var + 1e-4)
x = x.astype(mx.float16)
return weight * x + bias
x = mx.random.uniform(shape=(1000, 4096)).astype(mx.float16)
def gen_fun(fun):
def bench_fun(x):
for _ in range(10):
x = fun(x)
return x
return bench_fun
time_fn(gen_fun(layernorm), x, msg="fixed layernorm")
time_fn(gen_fun(mx.compile(layernorm)), x, msg="compiled fixed layernorm")
def randint():
return random.randint(1, x.shape[0])
def gen_fun(fun):
def bench_fun(x):
x = x[: randint()]
for _ in range(10):
x = fun(x)
return x
return bench_fun
random.seed(0)
time_fn(gen_fun(layernorm), x, msg="variable layernorm")
random.seed(0)
time_fn(gen_fun(mx.compile(layernorm)), x, msg="compiled variable layernorm")
random.seed(0)
time_fn(
gen_fun(mx.compile(layernorm, shapeless=True)),
x,
msg="shapeless variable layernorm",
)
if __name__ == "__main__":
parser = argparse.ArgumentParser("Compile benchmarks.")
args = parser.parse_args()
bench_gelu()
bench_layernorm()


@@ -0,0 +1,123 @@
import argparse
import math
import os
import subprocess
import time
import mlx.core as mx
import numpy as np
import torch
device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
device_name = device_name.decode("utf-8").strip("\n")
N_warmup = 10
N_iter_bench = 100
N_iter_func = 5
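# Timing protocol: bench() warms up with N_warmup calls, then times N_iter_bench
# calls of f; each call runs N_iter_func convolutions, so the reported figure is
# the total wall-clock seconds for N_iter_bench * N_iter_func convolutions.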
def bench(f, a, b):
for i in range(N_warmup):
f(a, b)
torch.mps.synchronize()
s = time.perf_counter_ns()
for i in range(N_iter_bench):
f(a, b)
e = time.perf_counter_ns()
return (e - s) * 1e-9
def make_mx_conv_1D(strides=1, padding=0, groups=1):
def mx_conv_1D(a, b):
ys = []
for _ in range(N_iter_func):
y = mx.conv1d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
mx.eval(ys)
return ys
return mx_conv_1D
def make_pt_conv_1D(strides=1, padding=0, groups=1):
@torch.no_grad()
def pt_conv_1D(a, b):
ys = []
for _ in range(N_iter_func):
y = torch.conv1d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
torch.mps.synchronize()
return ys
return pt_conv_1D
def bench_shape(N, iH, C, wH, O, strides, padding, np_dtype, groups):
scale = 1.0 / math.sqrt(wH * C)
a_np = np.random.uniform(0, 0.5, (N, iH, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, wH, int(C / groups))).astype(np_dtype)
a_mx = mx.array(a_np)
b_mx = mx.array(b_np)
a_pt = torch.from_numpy(a_np.transpose((0, 2, 1))).to("mps")
b_pt = torch.from_numpy(b_np.transpose((0, 2, 1))).to("mps")
torch.mps.synchronize()
f_mx = make_mx_conv_1D(strides, padding, groups)
f_pt = make_pt_conv_1D(strides, padding, groups)
time_torch = bench(f_pt, a_pt, b_pt)
time_mlx = bench(f_mx, a_mx, b_mx)
out_mx = mx.conv1d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
out_pt = torch.conv1d(
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
)
out_pt = torch.permute(out_pt, (0, 2, 1))
out_pt = out_pt.numpy(force=True)
atol = 2e-5 if np_dtype == np.float32 else 1e-4
if not np.allclose(out_pt, out_mx, atol=atol):
print(
f"Failed at {(N, iH, C)}, {(O, wH, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
)
return time_mlx, time_torch
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run conv benchmarks")
dtypes = ("float32",)
shapes = (
(4, 32, 32, 5, 32, 1, 2, 1),
(4, 32, 32, 5, 32, 1, 2, 2),
(4, 32, 32, 5, 32, 1, 2, 4),
(4, 32, 32, 5, 32, 1, 2, 8),
(4, 32, 32, 5, 32, 1, 2, 8),
(4, 32, 32, 5, 32, 1, 2, 16),
(4, 32, 32, 5, 32, 1, 2, 32),
(4, 32, 256, 5, 512, 1, 2, 2),
(4, 32, 256, 5, 512, 1, 2, 128),
(4, 32, 256, 5, 512, 1, 2, 256),
)
for dtype in dtypes:
print("(N, iH, C), (O, wH, C), dtype, stride, pads, groups, diff%")
for N, iH, C, wH, O, strides, padding, groups in shapes:
np_dtype = getattr(np, dtype)
time_mlx, time_torch = bench_shape(
N, iH, C, wH, O, strides, padding, np_dtype, groups
)
diff = time_torch / time_mlx - 1.0
print(
f"({N}, {iH:3d}, {C:3d}), ({O:3d}, {wH:2d}, {C:3d}), {dtype}, {strides:5d}, {padding:4d}, {groups:6d}, {100. * diff:+5.2f}%"
)
if time_mlx >= 2.0 * time_torch:
print("ATTENTION ^^^^^^^")


@@ -0,0 +1,127 @@
import argparse
import math
import time
import mlx.core as mx
import numpy as np
import torch
N_warmup = 1
N_iter_bench = 10
N_iter_func = 5
mx.set_default_device(mx.cpu)
def bench(f, a, b):
for i in range(N_warmup):
f(a, b)
s = time.perf_counter_ns()
for i in range(N_iter_bench):
f(a, b)
e = time.perf_counter_ns()
return (e - s) * 1e-9
def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
def mx_conv_2D(a, b):
ys = []
for i in range(N_iter_func):
y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
mx.eval(ys)
return ys
return mx_conv_2D
def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
@torch.no_grad()
def pt_conv_2D(a, b):
ys = []
for i in range(N_iter_func):
y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
return ys
return pt_conv_2D
def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
scale = 1.0 / math.sqrt(kH * kH * C)
a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
np_dtype
)
a_mx = mx.array(a_np)
b_mx = mx.array(b_np)
a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("cpu")
b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("cpu")
f_mx = make_mx_conv_2D(strides, padding, groups)
f_pt = make_pt_conv_2D(strides, padding, groups)
time_torch = bench(f_pt, a_pt, b_pt)
time_mlx = bench(f_mx, a_mx, b_mx)
out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
out_pt = torch.conv2d(
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
)
out_pt = torch.permute(out_pt, (0, 2, 3, 1))
out_pt = out_pt.numpy(force=True)
atol = 2e-5 if np_dtype == np.float32 else 1e-4
if not np.allclose(out_pt, out_mx, atol=atol):
print(
f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
)
return time_mlx, time_torch
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run conv benchmarks")
dtypes = ("float32",)
shapes = (
(4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
(4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
(4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
# (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
# (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
# (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
(4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
)
for dtype in dtypes:
print(
"(N, H, W, C), ( O, kH, kW, C), dtype, stride, pads, groups, diff%"
)
for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
np_dtype = getattr(np, dtype)
time_mlx, time_torch = bench_shape(
N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
)
diff = time_torch / time_mlx - 1.0
print(
f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
)
if time_mlx >= 2.0 * time_torch:
print("ATTENTION ^^^^^^^")


@@ -0,0 +1,143 @@
import time
import mlx.core as mx
import mlx.nn
import mlx.optimizers as opt
import torch
def bench_mlx(steps: int = 20) -> float:
mx.set_default_device(mx.cpu)
class BenchNetMLX(mlx.nn.Module):
# simple encoder-decoder net
def __init__(self, in_channels, hidden_channels=32):
super().__init__()
self.net = mlx.nn.Sequential(
mlx.nn.Conv2d(in_channels, hidden_channels, kernel_size=3, padding=1),
mlx.nn.ReLU(),
mlx.nn.Conv2d(
hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
),
mlx.nn.ReLU(),
mlx.nn.ConvTranspose2d(
2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
),
mlx.nn.ReLU(),
mlx.nn.ConvTranspose2d(
hidden_channels, in_channels, kernel_size=3, padding=1
),
)
def __call__(self, input):
return self.net(input)
benchNet = BenchNetMLX(3)
mx.eval(benchNet.parameters())
optim = opt.Adam(learning_rate=1e-3)
inputs = mx.random.normal([10, 256, 256, 3])
params = benchNet.parameters()
optim.init(params)
state = [benchNet.state, optim.state]
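# mx.eval(state) in the timing loop below forces both the updated model
# parameters and the optimizer state, so each measured step includes the full
# forward pass, gradient computation, and optimizer update.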
def loss_fn(params, image):
benchNet.update(params)
pred_image = benchNet(image)
return (pred_image - image).abs().mean()
def step(params, image):
loss, grads = mx.value_and_grad(loss_fn)(params, image)
optim.update(benchNet, grads)
return loss
total_time = 0.0
print("MLX:")
for i in range(steps):
start_time = time.perf_counter()
step(benchNet.parameters(), inputs)
mx.eval(state)
end_time = time.perf_counter()
print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
total_time += (end_time - start_time) * 1000
return total_time
def bench_torch(steps: int = 20) -> float:
device = torch.device("cpu")
class BenchNetTorch(torch.nn.Module):
# simple encoder-decoder net
def __init__(self, in_channels, hidden_channels=32):
super().__init__()
self.net = torch.nn.Sequential(
torch.nn.Conv2d(in_channels, hidden_channels, kernel_size=3, padding=1),
torch.nn.ReLU(),
torch.nn.Conv2d(
hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
),
torch.nn.ReLU(),
torch.nn.ConvTranspose2d(
2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
),
torch.nn.ReLU(),
torch.nn.ConvTranspose2d(
hidden_channels, in_channels, kernel_size=3, padding=1
),
)
def forward(self, input):
return self.net(input)
benchNet = BenchNetTorch(3).to(device)
optim = torch.optim.Adam(benchNet.parameters(), lr=1e-3)
inputs = torch.randn(10, 3, 256, 256, device=device)
def loss_fn(pred_image, image):
return (pred_image - image).abs().mean()
total_time = 0.0
print("PyTorch:")
for i in range(steps):
start_time = time.perf_counter()
optim.zero_grad()
pred_image = benchNet(inputs)
loss = loss_fn(pred_image, inputs)
loss.backward()
optim.step()
end_time = time.perf_counter()
print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
total_time += (end_time - start_time) * 1000
return total_time
def main():
steps = 20
time_mlx = bench_mlx(steps)
time_torch = bench_torch(steps)
print(f"average time of MLX: {time_mlx/steps:9.2f} ms")
print(f"total time of MLX: {time_mlx:9.2f} ms")
print(f"average time of PyTorch: {time_torch/steps:9.2f} ms")
print(f"total time of PyTorch: {time_torch:9.2f} ms")
diff = time_torch / time_mlx - 1.0
print(f"torch/mlx diff: {100. * diff:+5.2f}%")
if __name__ == "__main__":
main()


@@ -0,0 +1,129 @@
import argparse
import math
import time
import mlx.core as mx
import numpy as np
import torch
N_warmup = 1
N_iter_bench = 10
N_iter_func = 5
def bench(f, a, b):
for i in range(N_warmup):
f(a, b)
s = time.perf_counter_ns()
for i in range(N_iter_bench):
f(a, b)
e = time.perf_counter_ns()
return (e - s) * 1e-9
def make_mx_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
def mx_conv_transpose_2D(a, b):
ys = []
for i in range(N_iter_func):
y = mx.conv_transpose2d(
a, b, stride=strides, padding=padding, groups=groups, stream=mx.cpu
)
ys.append(y)
mx.eval(ys)
return ys
return mx_conv_transpose_2D
def make_pt_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
@torch.no_grad()
def pt_conv_transpose_2D(a, b):
ys = []
for i in range(N_iter_func):
y = torch.conv_transpose2d(
a, b, stride=strides, padding=padding, groups=groups
)
ys.append(y)
return ys
return pt_conv_transpose_2D
def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
scale = 1.0 / math.sqrt(kH * kH * C)
a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (int(O / groups), kH, kW, C)).astype(
np_dtype
)
a_mx = mx.array(a_np)
b_mx = mx.array(b_np)
a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("cpu")
b_pt = torch.from_numpy(b_np.transpose((3, 0, 1, 2))).to("cpu")
f_mx = make_mx_conv_transpose_2D(strides, padding, groups)
f_pt = make_pt_conv_transpose_2D(strides, padding, groups)
time_torch = bench(f_pt, a_pt, b_pt)
time_mlx = bench(f_mx, a_mx, b_mx)
out_mx = mx.conv_transpose2d(
a_mx, b_mx, stride=strides, padding=padding, groups=groups, stream=mx.cpu
)
out_pt = torch.conv_transpose2d(
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
)
out_pt = torch.permute(out_pt, (0, 2, 3, 1))
out_pt = out_pt.numpy(force=True)
atol = 2e-5 if np_dtype == np.float32 else 1e-4
if not np.allclose(out_pt, out_mx, atol=atol):
print(
f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
)
return time_mlx, time_torch
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run conv benchmarks")
dtypes = ("float32",)
shapes = (
(4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
(4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
(4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
(4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
)
for dtype in dtypes:
print(
"(N, H, W, C), ( O, kH, kW, C), dtype, stride, pads, groups, diff%"
)
for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
np_dtype = getattr(np, dtype)
time_mlx, time_torch = bench_shape(
N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
)
diff = time_torch / time_mlx - 1.0
print(
f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
)
if time_mlx >= 2.0 * time_torch:
print("ATTENTION ^^^^^^^")


@@ -0,0 +1,110 @@
import argparse
import math
import time
import mlx.core as mx
import numpy as np
import torch
N_warmup = 1
N_iter_bench = 10
N_iter_func = 5
mx.set_default_device(mx.cpu)
def bench(f, a, b):
for i in range(N_warmup):
f(a, b)
s = time.perf_counter_ns()
for i in range(N_iter_bench):
f(a, b)
e = time.perf_counter_ns()
return (e - s) * 1e-9
def make_mx_conv_3D(strides=(1, 1), padding=(0, 0), groups=1):
def mx_conv_3D(a, b):
ys = []
for i in range(N_iter_func):
y = mx.conv3d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
mx.eval(ys)
return ys
return mx_conv_3D
def make_pt_conv_3D(strides=(1, 1), padding=(0, 0), groups=1):
@torch.no_grad()
def pt_conv_3D(a, b):
ys = []
for i in range(N_iter_func):
y = torch.conv3d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
return ys
return pt_conv_3D
def bench_shape(N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype):
scale = 1.0 / math.sqrt(kD * kH * kW * C)
a_np = np.random.uniform(0, 0.5, (N, D, H, W, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, kD, kH, kW, int(C / groups))).astype(
np_dtype
)
a_mx = mx.array(a_np)
b_mx = mx.array(b_np)
a_pt = torch.from_numpy(a_np.transpose((0, 4, 1, 2, 3))).to("cpu")
b_pt = torch.from_numpy(b_np.transpose((0, 4, 1, 2, 3))).to("cpu")
f_mx = make_mx_conv_3D(strides, padding, groups)
f_pt = make_pt_conv_3D(strides, padding, groups)
time_torch = bench(f_pt, a_pt, b_pt)
time_mlx = bench(f_mx, a_mx, b_mx)
out_mx = mx.conv3d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
out_pt = torch.conv3d(
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
)
out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1))
out_pt = out_pt.numpy(force=True)
atol = 2e-5 if np_dtype == np.float32 else 1e-4
if not np.allclose(out_pt, out_mx, atol=atol):
print(
f"Failed at {(N, D, H, W, C)}, {(O, kD, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
)
return time_mlx, time_torch
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run conv benchmarks")
dtypes = ("float32",)
shapes = (
(4, 16, 16, 16, 16, 5, 5, 5, 16, (1, 1, 1), (2, 2, 2), 1),
(4, 16, 16, 16, 32, 5, 5, 5, 32, (1, 1, 1), (2, 2, 2), 1),
)
for dtype in dtypes:
print(
"(N, D, H, W, C), ( O, kD, kH, kW, C), dtype, stride, pads, groups, diff%"
)
for N, D, H, W, C, kD, kH, kW, O, strides, padding, groups in shapes:
np_dtype = getattr(np, dtype)
time_mlx, time_torch = bench_shape(
N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype
)
diff = time_torch / time_mlx - 1.0
print(
f"({N}, {D:3d}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kD:2d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
)
if time_mlx >= 2.0 * time_torch:
print("ATTENTION ^^^^^^^")


@@ -0,0 +1,143 @@
import time
import mlx.core as mx
import mlx.nn
import mlx.optimizers as opt
import torch
def bench_mlx(steps: int = 20, shape=(10, 32, 32, 32, 3)) -> float:
mx.set_default_device(mx.cpu)
class BenchNetMLX(mlx.nn.Module):
# simple encoder-decoder net
def __init__(self, in_channels, hidden_channels=16):
super().__init__()
self.net = mlx.nn.Sequential(
mlx.nn.Conv3d(in_channels, hidden_channels, kernel_size=3, padding=1),
mlx.nn.ReLU(),
mlx.nn.Conv3d(
hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
),
mlx.nn.ReLU(),
mlx.nn.ConvTranspose3d(
2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
),
mlx.nn.ReLU(),
mlx.nn.ConvTranspose3d(
hidden_channels, in_channels, kernel_size=3, padding=1
),
)
def __call__(self, input):
return self.net(input)
benchNet = BenchNetMLX(3)
mx.eval(benchNet.parameters())
optim = opt.Adam(learning_rate=1e-3)
inputs = mx.random.normal(shape)
params = benchNet.parameters()
optim.init(params)
state = [benchNet.state, optim.state]
def loss_fn(params, image):
benchNet.update(params)
pred_image = benchNet(image)
return (pred_image - image).abs().mean()
def step(params, image):
loss, grads = mx.value_and_grad(loss_fn)(params, image)
optim.update(benchNet, grads)
return loss
total_time = 0.0
print("MLX:")
for i in range(steps):
start_time = time.perf_counter()
step(benchNet.parameters(), inputs)
mx.eval(state)
end_time = time.perf_counter()
print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
total_time += (end_time - start_time) * 1000
return total_time
def bench_torch(steps: int = 20, shape=(10, 3, 32, 32, 32)) -> float:
device = torch.device("cpu")
class BenchNetTorch(torch.nn.Module):
# simple encoder-decoder net
def __init__(self, in_channels, hidden_channels=16):
super().__init__()
self.net = torch.nn.Sequential(
torch.nn.Conv3d(in_channels, hidden_channels, kernel_size=3, padding=1),
torch.nn.ReLU(),
torch.nn.Conv3d(
hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
),
torch.nn.ReLU(),
torch.nn.ConvTranspose3d(
2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
),
torch.nn.ReLU(),
torch.nn.ConvTranspose3d(
hidden_channels, in_channels, kernel_size=3, padding=1
),
)
def forward(self, input):
return self.net(input)
benchNet = BenchNetTorch(3).to(device)
optim = torch.optim.Adam(benchNet.parameters(), lr=1e-3)
inputs = torch.randn(*shape, device=device)
def loss_fn(pred_image, image):
return (pred_image - image).abs().mean()
total_time = 0.0
print("PyTorch:")
for i in range(steps):
start_time = time.perf_counter()
optim.zero_grad()
pred_image = benchNet(inputs)
loss = loss_fn(pred_image, inputs)
loss.backward()
optim.step()
end_time = time.perf_counter()
print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
total_time += (end_time - start_time) * 1000
return total_time
def main():
steps = 10
time_mlx = bench_mlx(steps)
time_torch = bench_torch(steps)
print(f"average time of MLX: {time_mlx/steps:9.2f} ms")
print(f"total time of MLX: {time_mlx:9.2f} ms")
print(f"average time of PyTorch: {time_torch/steps:9.2f} ms")
print(f"total time of PyTorch: {time_torch:9.2f} ms")
diff = time_torch / time_mlx - 1.0
print(f"torch/mlx diff: {100. * diff:+5.2f}%")
if __name__ == "__main__":
main()


@@ -0,0 +1,116 @@
import argparse
import math
import time
import mlx.core as mx
import numpy as np
import torch
N_warmup = 1
N_iter_bench = 10
N_iter_func = 5
mx.set_default_device(mx.cpu)
def bench(f, a, b):
for i in range(N_warmup):
f(a, b)
s = time.perf_counter_ns()
for i in range(N_iter_bench):
f(a, b)
e = time.perf_counter_ns()
return (e - s) * 1e-9
def make_mx_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
def mx_conv_3D(a, b):
ys = []
for i in range(N_iter_func):
y = mx.conv_transpose3d(
a, b, stride=strides, padding=padding, groups=groups
)
ys.append(y)
mx.eval(ys)
return ys
return mx_conv_3D
def make_pt_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
@torch.no_grad()
def pt_conv_3D(a, b):
ys = []
for i in range(N_iter_func):
y = torch.conv_transpose3d(
a, b, stride=strides, padding=padding, groups=groups
)
ys.append(y)
return ys
return pt_conv_3D
def bench_shape(N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype):
scale = 1.0 / math.sqrt(kD * kH * kW * C)
a_np = np.random.uniform(0, 0.5, (N, D, H, W, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, kD, kH, kW, int(C / groups))).astype(
np_dtype
)
a_mx = mx.array(a_np)
b_mx = mx.array(b_np)
a_pt = torch.from_numpy(a_np.transpose((0, 4, 1, 2, 3))).to("cpu")
b_pt = torch.from_numpy(b_np.transpose((4, 0, 1, 2, 3))).to("cpu")
f_mx = make_mx_conv_3D(strides, padding, groups)
f_pt = make_pt_conv_3D(strides, padding, groups)
time_torch = bench(f_pt, a_pt, b_pt)
time_mlx = bench(f_mx, a_mx, b_mx)
out_mx = mx.conv_transpose3d(
a_mx, b_mx, stride=strides, padding=padding, groups=groups
)
out_pt = torch.conv_transpose3d(
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
)
out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1))
out_pt = out_pt.numpy(force=True)
atol = 2e-5 if np_dtype == np.float32 else 1e-4
if not np.allclose(out_pt, out_mx, atol=atol):
print(
f"Failed at {(N, D, H, W, C)}, {(O, kD, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
)
return time_mlx, time_torch
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run conv benchmarks")
dtypes = ("float32",)
shapes = (
(4, 16, 16, 16, 16, 5, 5, 5, 16, (1, 1, 1), (2, 2, 2), 1),
(4, 16, 16, 16, 32, 5, 5, 5, 32, (1, 1, 1), (2, 2, 2), 1),
)
for dtype in dtypes:
print(
"(N, D, H, W, C), ( O, kD, kH, kW, C), dtype, stride, pads, groups, diff%"
)
for N, D, H, W, C, kD, kH, kW, O, strides, padding, groups in shapes:
np_dtype = getattr(np, dtype)
time_mlx, time_torch = bench_shape(
N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype
)
diff = time_torch / time_mlx - 1.0
print(
f"({N}, {D:3d}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kD:2d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
)
if time_mlx >= 2.0 * time_torch:
print("ATTENTION ^^^^^^^")


@@ -0,0 +1,135 @@
import argparse
import math
import os
import subprocess
import time
import mlx.core as mx
import numpy as np
import torch
device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
device_name = device_name.decode("utf-8").strip("\n")
N_warmup = 10
N_iter_bench = 100
N_iter_func = 5
def bench(f, a, b):
for i in range(N_warmup):
f(a, b)
torch.mps.synchronize()
s = time.perf_counter_ns()
for i in range(N_iter_bench):
f(a, b)
e = time.perf_counter_ns()
return (e - s) * 1e-9
def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
def mx_conv_2D(a, b):
ys = []
for i in range(N_iter_func):
y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
mx.eval(ys)
return ys
return mx_conv_2D
def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
@torch.no_grad()
def pt_conv_2D(a, b):
ys = []
for i in range(N_iter_func):
y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
ys.append(y)
torch.mps.synchronize()
return ys
return pt_conv_2D
def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
scale = 1.0 / math.sqrt(kH * kH * C)
a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
np_dtype
)
a_mx = mx.array(a_np)
b_mx = mx.array(b_np)
a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")
torch.mps.synchronize()
f_mx = make_mx_conv_2D(strides, padding, groups)
f_pt = make_pt_conv_2D(strides, padding, groups)
time_torch = bench(f_pt, a_pt, b_pt)
time_mlx = bench(f_mx, a_mx, b_mx)
out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
out_pt = torch.conv2d(
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
)
out_pt = torch.permute(out_pt, (0, 2, 3, 1))
out_pt = out_pt.numpy(force=True)
atol = 2e-5 if np_dtype == np.float32 else 1e-4
if not np.allclose(out_pt, out_mx, atol=atol):
print(
f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
)
return time_mlx, time_torch
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run conv benchmarks")
dtypes = ("float32",)
shapes = (
(4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
(4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
(4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
(4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
)
for dtype in dtypes:
print(
"(N, H, W, C), ( O, kH, kW, C), dtype, stride, pads, groups, diff%"
)
for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
np_dtype = getattr(np, dtype)
time_mlx, time_torch = bench_shape(
N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
)
diff = time_torch / time_mlx - 1.0
print(
f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
)
if time_mlx >= 2.0 * time_torch:
print("ATTENTION ^^^^^^^")


@@ -0,0 +1,135 @@
import argparse
import math
import os
import subprocess
import time
import mlx.core as mx
import numpy as np
import torch
N_warmup = 10
N_iter_bench = 100
N_iter_func = 5
def bench(f, a, b):
for i in range(N_warmup):
f(a, b)
torch.mps.synchronize()
s = time.perf_counter_ns()
for i in range(N_iter_bench):
f(a, b)
e = time.perf_counter_ns()
return (e - s) * 1e-9
def make_mx_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
def mx_conv_transpose_2D(a, b):
ys = []
for i in range(N_iter_func):
y = mx.conv_transpose2d(
a, b, stride=strides, padding=padding, groups=groups
)
ys.append(y)
mx.eval(ys)
return ys
return mx_conv_transpose_2D
def make_pt_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
@torch.no_grad()
def pt_conv_transpose_2D(a, b):
ys = []
for i in range(N_iter_func):
y = torch.conv_transpose2d(
a, b, stride=strides, padding=padding, groups=groups
)
ys.append(y)
torch.mps.synchronize()
return ys
return pt_conv_transpose_2D
def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
scale = 1.0 / math.sqrt(kH * kH * C)
a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
np_dtype
)
a_mx = mx.array(a_np)
b_mx = mx.array(b_np)
a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
b_pt = torch.from_numpy(b_np.transpose((3, 0, 1, 2))).to("mps")
torch.mps.synchronize()
f_mx = make_mx_conv_transpose_2D(strides, padding, groups)
f_pt = make_pt_conv_transpose_2D(strides, padding, groups)
time_torch = bench(f_pt, a_pt, b_pt)
time_mlx = bench(f_mx, a_mx, b_mx)
out_mx = mx.conv_transpose2d(
a_mx, b_mx, stride=strides, padding=padding, groups=groups
)
out_pt = torch.conv_transpose2d(
a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
)
out_pt = torch.permute(out_pt, (0, 2, 3, 1))
out_pt = out_pt.numpy(force=True)
atol = 2e-5 if np_dtype == np.float32 else 1e-4
if not np.allclose(out_pt, out_mx, atol=atol):
print(
f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
)
return time_mlx, time_torch
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run conv benchmarks")
dtypes = ("float32",)
shapes = (
(4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
(4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
(4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
(4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
(4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
(4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
(4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
(4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
)
for dtype in dtypes:
print(
"(N, H, W, C), ( O, kH, kW, C), dtype, stride, pads, groups, diff%"
)
for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
np_dtype = getattr(np, dtype)
time_mlx, time_torch = bench_shape(
N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
)
diff = time_torch / time_mlx - 1.0
print(
f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
)
if time_mlx >= 2.0 * time_torch:
print("ATTENTION ^^^^^^^")


@@ -0,0 +1,66 @@
# Copyright © 2024 Apple Inc.
"""
Run with:
mpirun -n 2 python /path/to/distributed_bench.py
"""
import time
import mlx.core as mx
def time_fn(fn, *args, **kwargs):
msg = kwargs.pop("msg", None)
world = mx.distributed.init()
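# Only rank 0 prints; every rank still runs the warmup and timing loops so the
# collective ops (e.g. all_sum) stay matched across processes.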
if world.rank() == 0:
if msg:
print(f"Timing {msg} ...", end=" ")
else:
print(f"Timing {fn.__name__} ...", end=" ")
# warmup
for _ in range(5):
mx.eval(fn(*args, **kwargs))
num_iters = 100
tic = time.perf_counter()
for _ in range(num_iters):
x = mx.eval(fn(*args, **kwargs))
toc = time.perf_counter()
msec = 1e3 * (toc - tic) / num_iters
if world.rank() == 0:
print(f"{msec:.5f} msec")
def time_all_sum():
shape = (4096,)
x = mx.random.uniform(shape=shape)
mx.eval(x)
def sine(x):
for _ in range(20):
x = mx.sin(x)
return x
time_fn(sine, x)
def all_sum_plain(x):
for _ in range(20):
x = mx.distributed.all_sum(x)
return x
time_fn(all_sum_plain, x)
def all_sum_with_sine(x):
for _ in range(20):
x = mx.sin(x)
x = mx.distributed.all_sum(x)
return x
time_fn(all_sum_with_sine, x)
if __name__ == "__main__":
time_all_sum()


@@ -0,0 +1,84 @@
# Copyright © 2024 Apple Inc.
import time
import mlx.core as mx
import numpy as np
def timeit(fn, its=100, args=[]):
for _ in range(5):
fn(*args)
tic = time.perf_counter()
for _ in range(its):
fn(*args)
toc = time.perf_counter()
return 1e3 * (toc - tic) / its
def time_little_einsum_path():
subscripts = "ik,kj->ij"
x = mx.ones((32, 32))
y = mx.ones((32, 32))
mx_time = timeit(mx.einsum_path, args=(subscripts, x, y))
x = np.array(x)
y = np.array(y)
np_time = timeit(np.einsum_path, args=(subscripts, x, y))
print("Timing little einsum path...")
print(f"MLX ... {mx_time:.3f} ms")
print(f"NumPy... {np_time:.3f} ms")
def time_big_einsum_path():
chars = list("abcdefgh")
char_to_dim = {c: v for v, c in enumerate(chars)}
num_inputs = 10
inputs = []
subscripts = []
for _ in range(num_inputs):
subscript = np.random.choice(chars, size=5, replace=False).tolist()
subscripts.append("".join(subscript))
inputs.append(np.ones(list(char_to_dim[c] for c in subscript)))
subscripts = ",".join(subscripts)
np_time = timeit(np.einsum_path, args=(subscripts, *inputs))
inputs = [mx.array(x) for x in inputs]
mx_time = timeit(mx.einsum_path, args=(subscripts, *inputs))
print("Timing big einsum path...")
print(f"MLX ... {mx_time:.3f} ms")
print(f"NumPy... {np_time:.3f} ms")
def time_attention():
def regular_attention(x):
# shape [batch, sequence, num_heads, head_dim]
queries, keys, values = x, x, x
scores = queries.transpose(0, 2, 1, 3) @ keys.transpose(0, 2, 3, 1)
scores = mx.softmax(scores, axis=-1)
output = (scores @ values.transpose(0, 2, 1, 3)).swapaxes(1, 2)
mx.eval(output)
def einsum_attention(x):
# shape [batch, sequence, num_heads, head_dim]
queries, keys, values = x, x, x
scores = mx.einsum("itjk,iujk->ijtu", queries, keys)
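# The einsum above is equivalent to the transpose + matmul in regular_attention:
# scores[i, j, t, u] = sum_k queries[i, t, j, k] * keys[i, u, j, k]
# (batch i, heads j, query/key positions t and u).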
scores = mx.softmax(scores, axis=-1)
output = mx.einsum("ijtu,iujk->itjk", scores, values)
mx.eval(output)
x = mx.random.uniform(shape=(8, 512, 32, 128))
regular_time = timeit(regular_attention, args=(x,))
ein_time = timeit(einsum_attention, args=(x,))
print("Timing einsum attention...")
print(f"Regular ... {regular_time:.3f} ms")
print(f"Einsum ... {ein_time:.3f} ms")
if __name__ == "__main__":
time_little_einsum_path()
time_big_einsum_path()
time_attention()


@@ -0,0 +1,118 @@
# Copyright © 2024 Apple Inc.
import matplotlib
import mlx.core as mx
import numpy as np
import sympy
import torch
from time_utils import measure_runtime
matplotlib.use("Agg")
import matplotlib.pyplot as plt
def bandwidth_gb(runtime_ms, system_size):
bytes_per_fft = np.dtype(np.complex64).itemsize * 2
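# complex64 is 8 bytes per element; the factor of 2 assumes each element is
# read once and written once per FFT.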
bytes_per_gb = 1e9
ms_per_s = 1e3
return system_size * bytes_per_fft / runtime_ms * ms_per_s / bytes_per_gb
def run_bench(system_size, fft_sizes, backend="mlx", dim=1):
def fft_mlx(x):
if dim == 1:
out = mx.fft.fft(x)
elif dim == 2:
out = mx.fft.fft2(x)
mx.eval(out)
return out
def fft_mps(x):
if dim == 1:
out = torch.fft.fft(x)
elif dim == 2:
out = torch.fft.fft2(x)
torch.mps.synchronize()
return out
bandwidths = []
for n in fft_sizes:
batch_size = system_size // n**dim
shape = [batch_size] + [n for _ in range(dim)]
if backend == "mlx":
x_np = np.random.uniform(size=(system_size // n, n)).astype(np.complex64)
x = mx.array(x_np)
mx.eval(x)
fft = fft_mlx
elif backend == "mps":
x_np = np.random.uniform(size=(system_size // n, n)).astype(np.complex64)
x = torch.tensor(x_np, device="mps")
torch.mps.synchronize()
fft = fft_mps
else:
raise NotImplementedError()
runtime_ms = measure_runtime(fft, x=x)
bandwidth = bandwidth_gb(runtime_ms, np.prod(shape))
print(n, bandwidth)
bandwidths.append(bandwidth)
return np.array(bandwidths)
def time_fft():
x = np.array(range(2, 512))
system_size = int(2**26)
print("MLX GPU")
with mx.stream(mx.gpu):
gpu_bandwidths = run_bench(system_size=system_size, fft_sizes=x)
print("MPS GPU")
mps_bandwidths = run_bench(system_size=system_size, fft_sizes=x, backend="mps")
print("CPU")
system_size = int(2**20)
with mx.stream(mx.cpu):
cpu_bandwidths = run_bench(system_size=system_size, fft_sizes=x)
x = np.array(x)
all_indices = x - x[0]
radix_2to13 = (
np.array([i for i in x if all(p <= 13 for p in sympy.primefactors(i))]) - x[0]
)
bluesteins = (
np.array([i for i in x if any(p > 13 for p in sympy.primefactors(i))]) - x[0]
)
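# Split sizes by prime factorization: sizes whose prime factors are all <= 13
# are grouped as radix FFTs, and anything with a larger prime factor is grouped
# under Bluestein's algorithm.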
for indices, name in [
(all_indices, "All"),
(radix_2to13, "Radix 2-13"),
(bluesteins, "Bluestein's"),
]:
# plot bandwidths
print(name)
plt.scatter(x[indices], gpu_bandwidths[indices], color="green", label="GPU")
plt.scatter(x[indices], mps_bandwidths[indices], color="blue", label="MPS")
plt.scatter(x[indices], cpu_bandwidths[indices], color="red", label="CPU")
plt.title(f"MLX FFT Benchmark -- {name}")
plt.xlabel("N")
plt.ylabel("Bandwidth (GB/s)")
plt.legend()
plt.savefig(f"{name}.png")
plt.clf()
av_gpu_bandwidth = np.mean(gpu_bandwidths)
av_mps_bandwidth = np.mean(mps_bandwidths)
av_cpu_bandwidth = np.mean(cpu_bandwidths)
print("Average bandwidths:")
print("GPU:", av_gpu_bandwidth)
print("MPS:", av_mps_bandwidth)
print("CPU:", av_cpu_bandwidth)
portion_faster = len(np.where(gpu_bandwidths > mps_bandwidths)[0]) / len(x)
print("Percent MLX faster than MPS: ", portion_faster * 100)
if __name__ == "__main__":
time_fft()


@@ -0,0 +1,53 @@
# Copyright © 2023-2024 Apple Inc.
import argparse
from time import time
import mlx.core as mx
import torch
from time_utils import measure_runtime
def benchmark_gather_mlx(x_shape, idx_shape):
def gather(x, idx):
mx.eval(x[idx])
idx = mx.random.randint(0, x_shape[0] - 1, idx_shape)
x = mx.random.normal(x_shape).astype(mx.float32)
runtime = measure_runtime(gather, x=x, idx=idx)
print(f"MLX: {runtime:.3f}ms")
def benchmark_gather_torch(x_shape, idx_shape, device):
def gather(x, idx, device):
_ = x[idx]
if device == torch.device("mps"):
torch.mps.synchronize()
idx = torch.randint(0, x_shape[0] - 1, idx_shape).to(device)
x = torch.randn(x_shape, dtype=torch.float32).to(device)
runtime = measure_runtime(gather, x=x, idx=idx, device=device)
print(f"PyTorch: {runtime:.3f}ms")
if __name__ == "__main__":
parser = argparse.ArgumentParser("Gather benchmarks.")
parser.add_argument("--cpu", action="store_true", help="Use the CPU.")
args = parser.parse_args()
if args.cpu:
mx.set_default_device(mx.cpu)
device = torch.device("cpu")
else:
device = torch.device("mps")
idx_shapes = [(1_000_000,), (100_000,), ()]
x_shapes = [(100, 64), (100, 1024), (4, 1_000_000)]
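# Note: the last pair uses the empty index shape (), i.e. gathering with a
# single scalar index from the (4, 1_000_000) array.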
for x_shape, idx_shape in zip(x_shapes, idx_shapes):
print("=" * 20)
print(f"X {x_shape}, Indices {idx_shape}")
benchmark_gather_mlx(x_shape, idx_shape)
benchmark_gather_torch(x_shape, idx_shape, device=device)


@@ -0,0 +1,70 @@
import argparse
import matplotlib
import mlx.core as mx
import numpy as np
from time_utils import measure_runtime
matplotlib.use("Agg")
import matplotlib.pyplot as plt
def had(x):
y = mx.hadamard_transform(x)
mx.eval(y)
def copy(x):
y = x + 1.0
mx.eval(y)
def run(dtype):
system_size = 2**26
outputs = {}
for test_fn in (had, copy):
for m in [1, 12, 20, 28]:
if test_fn == copy:
key = "copy"
elif m == 1:
key = "had_2^k"
else:
key = "had_m*2^k"
outputs.setdefault(key, {})
for k in range(7, 14):
n = m * 2**k
if n > 2**15:
continue
x_np = np.random.normal(size=(system_size // n, n)).astype(dtype)
x = mx.array(x_np)
runtime_ms = measure_runtime(test_fn, x=x)
bytes_per_gb = 1e9
ms_per_s = 1e3
bytes_per_had = np.dtype(x_np.dtype).itemsize * 2
bandwidth_gb = (
system_size * bytes_per_had / runtime_ms * ms_per_s / bytes_per_gb
)
print(n, bandwidth_gb)
outputs[key][n] = bandwidth_gb
colors = {
"copy": "black",
"had_2^k": "steelblue",
"had_m*2^k": "skyblue",
}
for key, output in outputs.items():
plt.scatter(output.keys(), output.values(), color=colors[key], label=key)
plt.title(f"MLX Hadamard Benchmark -- {dtype.__name__}")
plt.xlabel("N")
plt.ylabel("Bandwidth (GB/s)")
plt.legend()
plt.savefig(f"bench_{dtype.__name__}.png")
plt.clf()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--fp16", action="store_true")
args = parser.parse_args()
dtype = np.float16 if args.fp16 else np.float32
run(dtype)


@@ -0,0 +1,41 @@
# Copyright © 2023-2024 Apple Inc.
import mlx.core as mx
import mlx.nn as nn
from time_utils import time_fn
def layer_norm(x, w, b, eps):
ot = x.dtype
x = x.astype(mx.float32)
mu = mx.mean(x, -1, keepdims=True)
v = mx.var(x, -1, keepdims=True)
return (x - mu) * mx.rsqrt(v + eps) * w + b
def time_layer_norm():
f1 = lambda x, w, b, y: (layer_norm(x, w, b, 1e-5) * y).sum()
f2 = lambda x, w, b, y: (mx.fast.layer_norm(x, w, b, 1e-5) * y).sum()
g1 = mx.grad(f1, argnums=(0, 1, 2))
g2 = mx.grad(f2, argnums=(0, 1, 2))
x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
b = mx.random.uniform(shape=(4096,)).astype(mx.float16)
y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
mx.eval(x, w, b, y)
def layer_norm_loop(g, x, w, b):
gx, gw, gb = x, w, b
for _ in range(32):
gx, gw, gb = g(gx, gw, gb, y)
return gx, gw, gb
time_fn(layer_norm_loop, g1, x, w, b)
time_fn(layer_norm_loop, g2, x, w, b)
time_fn(layer_norm_loop, mx.compile(g1), x, w, b)
time_fn(layer_norm_loop, mx.compile(g2), x, w, b)
if __name__ == "__main__":
time_layer_norm()


@@ -1,198 +0,0 @@
# Copyright © 2023 Apple Inc.
import math
import time
import jax
import jax.numpy as jnp
from flax import linen as nn
class RoPE(nn.Module):
dims: int
traditional: bool = False
def _compute_rope(self, costheta, sintheta, x):
x1 = x[..., : self.dims // 2]
x2 = x[..., self.dims // 2 : self.dims]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
rx = jnp.concatenate([rx1, rx2, x[..., self.dims :]], axis=-1)
else:
rx = jnp.concatenate([rx1, rx2], axis=-1)
return rx
def _compute_traditional_rope(self, costheta, sintheta, x):
x1 = x[..., ::2]
x2 = x[..., 1::2]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
raise NotImplementedError(
"RoPE doesn't implement partial traditional application"
)
rx = jnp.concatenate([rx1[..., None], rx2[..., None]], axis=-1)
return rx
@staticmethod
def create_cos_sin_theta(
N: int,
D: int,
offset: int = 0,
base: float = 10000,
dtype=jnp.float32,
):
D = D // 2
positions = jnp.arange(offset, N, dtype=dtype)
freqs = jnp.exp(-jnp.arange(0, D, dtype=dtype) * (math.log(base) / D))
theta = positions.reshape((-1, 1)) * freqs.reshape((1, -1))
costheta = jnp.cos(theta)
sintheta = jnp.sin(theta)
return costheta, sintheta
@nn.compact
def __call__(self, x, offset: int = 0):
shape = x.shape
x = x.reshape((-1, shape[-2], shape[-1]))
N = x.shape[1] + offset
costheta, sintheta = RoPE.create_cos_sin_theta(
N, self.dims, offset=offset, dtype=x.dtype
)
rope = (
self._compute_traditional_rope if self.traditional else self._compute_rope
)
rx = rope(costheta, sintheta, x)
return rx.reshape(shape)
class LlamaAttention(nn.Module):
dims: int
num_heads: int
dtype: jnp.dtype
def setup(self):
num_heads = self.num_heads
dims = self.dims
self.rope = RoPE(dims // num_heads, True)
self.query_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
self.key_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
self.value_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
self.out_proj = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
def __call__(self, queries, keys, values, mask=None, cache=None):
queries = self.query_proj(queries)
keys = self.key_proj(keys)
values = self.value_proj(values)
num_heads = self.num_heads
B, L, D = queries.shape
queries = queries.reshape((B, L, num_heads, -1)).transpose((0, 2, 1, 3))
keys = keys.reshape((B, L, num_heads, -1)).transpose((0, 2, 1, 3))
values = values.reshape((B, L, num_heads, -1)).transpose((0, 2, 1, 3))
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = jnp.concatenate([key_cache, keys], axis=2)
values = jnp.concatenate([value_cache, values], axis=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
# Dimensions are [batch x num heads x sequence x hidden dim]
scale = math.sqrt(1 / queries.shape[-1])
scores = (queries * scale) @ keys.transpose((0, 1, 3, 2))
if mask is not None:
scores = scores + mask
scores = jax.nn.softmax(scores, axis=-1)
values_hat = (scores @ values).transpose((0, 2, 1, 3)).reshape((B, L, -1))
return self.out_proj(values_hat), (keys, values)
class LlamaEncoderLayer(nn.Module):
dims: int
mlp_dims: int
num_heads: int
dtype: jnp.dtype
def setup(self):
dims = self.dims
mlp_dims = self.mlp_dims
num_heads = self.num_heads
self.attention = LlamaAttention(dims, num_heads, dtype)
self.norm1 = nn.RMSNorm(param_dtype=self.dtype)
self.norm2 = nn.RMSNorm(param_dtype=self.dtype)
self.linear1 = nn.Dense(mlp_dims, use_bias=False, param_dtype=self.dtype)
self.linear2 = nn.Dense(mlp_dims, use_bias=False, param_dtype=self.dtype)
self.linear3 = nn.Dense(dims, use_bias=False, param_dtype=self.dtype)
def __call__(self, x, mask=None, cache=None):
y = self.norm1(x)
y, cache = self.attention(y, y, y, mask, cache)
x = x + y
y = self.norm2(x)
a = self.linear1(y)
b = self.linear2(y)
y = jax.nn.silu(a) * b
y = self.linear3(y)
x = x + y
return x, cache
def measure(model, x, cache):
for i in range(5):
y, c = model(x, mask=None, cache=cache)
jax.block_until_ready((y, c))
start = time.time()
for i in range(5):
y, c = model(x, mask=None, cache=cache)
jax.block_until_ready((y, c))
end = time.time()
return (end - start) * 1000 / 5
if __name__ == "__main__":
H = 32
D = 4096
F = 43 * 256
C = 1000
dtype = jnp.float16
k1, k2, k3, k4 = jax.random.split(jax.random.PRNGKey(0), 4)
x = jax.random.normal(k1, (1, 1, D), dtype)
cache = [
jax.random.normal(k2, [1, H, C, D // H], dtype),
jax.random.normal(k3, [1, H, C, D // H], dtype),
]
layer = LlamaEncoderLayer(D, F, H, dtype=dtype)
params = layer.init(k4, x, mask=None, cache=cache)["params"]
@jax.jit
def model_fn(x, mask, cache):
return layer.apply({"params": params}, x, mask=mask, cache=cache)
T = measure(model_fn, x, cache)
print("Time per layer per token:", T, "ms")
print("Lower bound total time per token:", T * 32, "ms")


@@ -1,118 +0,0 @@
# Copyright © 2023 Apple Inc.
import math
import time
import mlx.core as mx
import mlx.nn as nn
import mlx.utils
class LlamaAttention(nn.Module):
def __init__(self, dims: int, num_heads: int):
super().__init__()
self.num_heads = num_heads
self.rope = nn.RoPE(dims // num_heads, True)
self.query_proj = nn.Linear(dims, dims, False)
self.key_proj = nn.Linear(dims, dims, False)
self.value_proj = nn.Linear(dims, dims, False)
self.out_proj = nn.Linear(dims, dims, False)
def __call__(self, queries, keys, values, mask=None, cache=None):
queries = self.query_proj(queries)
keys = self.key_proj(keys)
values = self.value_proj(values)
num_heads = self.num_heads
B, L, D = queries.shape
queries = mx.transpose(mx.reshape(queries, (B, L, num_heads, -1)), (0, 2, 1, 3))
keys = mx.transpose(mx.reshape(keys, (B, L, num_heads, -1)), (0, 2, 1, 3))
values = mx.transpose(mx.reshape(values, (B, L, num_heads, -1)), (0, 2, 1, 3))
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = mx.concatenate([key_cache, keys], axis=2)
values = mx.concatenate([value_cache, values], axis=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
# Dimensions are [batch x num heads x sequence x hidden dim]
scale = mx.array(math.sqrt(1 / queries.shape[-1]), dtype=queries.dtype)
scores = (queries * scale) @ mx.transpose(keys, (0, 1, 3, 2))
if mask is not None:
scores = scores + mask
scores = mx.softmax(scores, axis=-1)
values_hat = mx.reshape(mx.transpose(scores @ values, (0, 2, 1, 3)), (B, L, -1))
return self.out_proj(values_hat), (keys, values)
class LlamaEncoderLayer(nn.Module):
def __init__(self, dims: int, mlp_dims: int, num_heads: int):
super().__init__()
self.attention = LlamaAttention(dims, num_heads)
self.norm1 = nn.RMSNorm(dims)
self.norm2 = nn.RMSNorm(dims)
self.linear1 = nn.Linear(dims, mlp_dims, False)
self.linear2 = nn.Linear(dims, mlp_dims, False)
self.linear3 = nn.Linear(mlp_dims, dims, False)
def __call__(self, x, mask=None, cache=None):
y = self.norm1(x)
y, cache = self.attention(y, y, y, mask, cache)
x = x + y
y = self.norm2(x)
a = self.linear1(y)
b = self.linear2(y)
y = a * mx.sigmoid(a) * b
y = self.linear3(y)
x = x + y
return x, cache
def measure(model, x, cache):
for i in range(5):
y, c = model(x, mask=None, cache=cache)
mx.eval(y, c)
start = time.time()
rs = []
for i in range(5):
y, c = model(x, mask=None, cache=cache)
rs.append((y, c))
mx.eval(rs)
end = time.time()
return (end - start) * 1000 / 5
if __name__ == "__main__":
H = 32
D = 4096
F = 43 * 256
C = 1000
mx.set_default_device(mx.gpu)
dtype = mx.float16
layer = LlamaEncoderLayer(D, F, H)
layer.update(mlx.utils.tree_map(lambda x: x.astype(dtype), layer.parameters()))
k1, k2, k3 = mx.random.split(mx.random.key(0), 3)
x = mx.random.normal([1, 1, D], dtype=dtype)
cache = [
mx.random.normal([1, H, C, D // H], dtype=dtype),
mx.random.normal([1, H, C, D // H], dtype=dtype),
]
mx.eval(x, cache)
T = measure(layer, x, cache)
print("Time per layer per token:", T, "ms")
print("Lower bound total time per token:", T * 32, "ms")


@@ -1,199 +0,0 @@
# Copyright © 2023 Apple Inc.
import math
import time
import torch
import torch.nn as nn
import torch.mps
def sync_if_needed(x):
if x.device != torch.device("cpu"):
torch.mps.synchronize()
class RoPE(nn.Module):
def __init__(self, dims: int, traditional: bool = False):
super().__init__()
self.dims = dims
self.traditional = traditional
def _compute_rope(self, costheta, sintheta, x):
x1 = x[..., : self.dims // 2]
x2 = x[..., self.dims // 2 : self.dims]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
rx = torch.cat([rx1, rx2, x[..., self.dims :]], dim=-1)
else:
rx = torch.cat([rx1, rx2], dim=-1)
return rx
def _compute_traditional_rope(self, costheta, sintheta, x):
x1 = x[..., ::2]
x2 = x[..., 1::2]
rx1 = x1 * costheta - x2 * sintheta
rx2 = x1 * sintheta + x2 * costheta
if self.dims < x.shape[-1]:
raise NotImplementedError(
"RoPE doesn't implement partial traditional application"
)
rx = torch.cat([rx1[..., None], rx2[..., None]], dim=-1)
return rx
def forward(self, x, offset: int = 0):
shape = x.shape
x = x.view(-1, shape[-2], shape[-1])
N = x.shape[1] + offset
costheta, sintheta = RoPE.create_cos_sin_theta(
N, self.dims, offset=offset, device=x.device, dtype=x.dtype
)
rope = (
self._compute_traditional_rope if self.traditional else self._compute_rope
)
rx = rope(costheta, sintheta, x)
return rx.view(*shape)
@staticmethod
def create_cos_sin_theta(
N: int,
D: int,
offset: int = 0,
base: float = 10000,
device="cpu",
dtype=torch.float32,
):
D = D // 2
positions = torch.arange(offset, N, dtype=dtype, device=device)
freqs = torch.exp(
-torch.arange(0, D, dtype=dtype, device=device) * (math.log(base) / D)
)
theta = positions.view(-1, 1) * freqs.view(1, -1)
costheta = torch.cos(theta)
sintheta = torch.sin(theta)
return costheta, sintheta
class RMSNorm(nn.Module):
def __init__(self, dims: int, epsilon: float = 1e-6):
super().__init__()
self.gamma = nn.Parameter(torch.ones((dims,)))
self.epsilon = epsilon
def forward(self, x):
n = torch.rsqrt(x.square().mean(dim=-1, keepdims=True) + self.epsilon)
return self.gamma * x * n
class LlamaAttention(nn.Module):
def __init__(self, dims: int, num_heads: int):
super().__init__()
self.num_heads = num_heads
self.rope = RoPE(dims // num_heads, True)
self.query_proj = nn.Linear(dims, dims, bias=False)
self.key_proj = nn.Linear(dims, dims, bias=False)
self.value_proj = nn.Linear(dims, dims, bias=False)
self.out_proj = nn.Linear(dims, dims, bias=False)
def forward(self, queries, keys, values, mask=None, cache=None):
queries = self.query_proj(queries)
keys = self.key_proj(keys)
values = self.value_proj(values)
num_heads = self.num_heads
B, L, D = queries.shape
queries = queries.view(B, L, num_heads, -1).permute(0, 2, 1, 3)
keys = keys.view(B, L, num_heads, -1).permute(0, 2, 1, 3)
values = values.view(B, L, num_heads, -1).permute(0, 2, 1, 3)
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = torch.cat([key_cache, keys], dim=2)
values = torch.cat([value_cache, values], dim=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
# Dimensions are [batch x num heads x sequence x hidden dim]
scale = math.sqrt(1 / queries.shape[-1])
scores = (queries * scale) @ keys.permute(0, 1, 3, 2)
if mask is not None:
scores = scores + mask
scores = torch.softmax(scores, dim=-1)
values_hat = (scores @ values).permute(0, 2, 1, 3).reshape(B, L, -1)
return self.out_proj(values_hat), (keys, values)
class LlamaEncoderLayer(nn.Module):
def __init__(self, dims: int, mlp_dims: int, num_heads: int):
super().__init__()
self.attention = LlamaAttention(dims, num_heads)
self.norm1 = RMSNorm(dims)
self.norm2 = RMSNorm(dims)
self.linear1 = nn.Linear(dims, mlp_dims, bias=False)
self.linear2 = nn.Linear(dims, mlp_dims, bias=False)
self.linear3 = nn.Linear(mlp_dims, dims, bias=False)
def forward(self, x, mask=None, cache=None):
y = self.norm1(x)
y, cache = self.attention(y, y, y, mask, cache)
x = x + y
y = self.norm2(x)
a = self.linear1(y)
b = self.linear2(y)
y = torch.nn.functional.silu(a) * b
y = self.linear3(y)
x = x + y
return x, cache
@torch.no_grad()
def measure(model, x, cache):
for i in range(5):
y, c = model(x, mask=None, cache=cache)
sync_if_needed(x)
start = time.time()
for i in range(5):
y, c = model(x, mask=None, cache=cache)
sync_if_needed(x)
end = time.time()
return (end - start) * 1000 / 5
if __name__ == "__main__":
H = 32
D = 4096
F = 43 * 256
C = 1000
device = torch.device("mps")
dtype = torch.float16
layer = LlamaEncoderLayer(D, F, H).to(device).to(dtype)
x = torch.randn(1, 1, D).to(device).to(dtype)
cache = [
torch.randn(1, H, C, D // H).to(device).to(dtype),
torch.randn(1, H, C, D // H).to(device).to(dtype),
]
T = measure(layer, x, cache)
print("Time per layer per token:", T, "ms")
print("Lower bound total time per token:", T * 32, "ms")


@@ -0,0 +1,39 @@
# Copyright © 2023-2024 Apple Inc.
import mlx.core as mx
import mlx.nn as nn
from time_utils import time_fn
def rms_norm(x, w, eps):
ot = x.dtype
x = x.astype(mx.float32)
n = mx.rsqrt(x.square().mean(-1, keepdims=True) + eps)
return (x * n).astype(ot) * w
def time_rms_norm():
f1 = lambda x, w, y: (rms_norm(x, w, 1e-5) * y).sum()
f2 = lambda x, w, y: (mx.fast.rms_norm(x, w, 1e-5) * y).sum()
g1 = mx.grad(f1, argnums=(0, 1))
g2 = mx.grad(f2, argnums=(0, 1))
x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
mx.eval(x, w, y)
def rms_norm_loop(g, x, w):
gx, gw = x, w
for _ in range(32):
gx, gw = g(gx, gw, y)
return gx, gw
time_fn(rms_norm_loop, g1, x, w)
time_fn(rms_norm_loop, g2, x, w)
time_fn(rms_norm_loop, mx.compile(g1), x, w)
time_fn(rms_norm_loop, mx.compile(g2), x, w)
if __name__ == "__main__":
time_rms_norm()


@@ -0,0 +1,35 @@
# Copyright © 2023-2024 Apple Inc.
import mlx.core as mx
import mlx.nn as nn
from time_utils import time_fn
def time_rope():
rope = nn.RoPE(64)
# vec
x = mx.random.uniform(shape=(1, 32, 1, 128)).astype(mx.float16)
mx.eval(x)
def rope_vec(x):
for _ in range(32):
x = rope(x, offset=100)
return x
time_fn(rope_vec, x)
# matrix
x = mx.random.uniform(shape=(1, 32, 1024, 128)).astype(mx.float16)
mx.eval(x)
def rope_mat(x):
for _ in range(32):
x = rope(x)
return x
time_fn(rope_mat, x)
if __name__ == "__main__":
time_rope()


@@ -0,0 +1,96 @@
# Copyright © 2023-2024 Apple Inc.
import argparse
import mlx.core as mx
import torch
from time_utils import measure_runtime
def benchmark_scatter_mlx(dst_shape, x_shape, idx_shapes):
def scatter(dst, x, idx):
dst[tuple(idx)] = x
mx.eval(dst)
idx = []
for idx_shape in idx_shapes:
idx.append(mx.random.randint(0, dst_shape[0] - 1, idx_shape))
x = mx.random.normal(x_shape).astype(mx.float32)
dst = mx.random.normal(dst_shape).astype(mx.float32)
runtime = measure_runtime(scatter, dst=dst, x=x, idx=idx)
print(f"MLX: {runtime:.3f}ms")
def benchmark_scatter_torch(dst_shape, x_shape, idx_shapes, device):
def scatter(dst, x, idx, device):
dst[tuple(idx)] = x
if device == torch.device("mps"):
torch.mps.synchronize()
idx = []
for idx_shape in idx_shapes:
idx.append(torch.randint(0, dst_shape[0] - 1, idx_shape).to(device))
x = torch.randn(x_shape, dtype=torch.float32).to(device)
dst = torch.randn(dst_shape, dtype=torch.float32).to(device)
runtime = measure_runtime(scatter, dst=dst, x=x, idx=idx, device=device)
print(f"PyTorch: {runtime:.3f}ms")
if __name__ == "__main__":
parser = argparse.ArgumentParser("Scatter benchmarks.")
parser.add_argument("--cpu", action="store_true", help="Use the CPU.")
args = parser.parse_args()
if args.cpu:
mx.set_default_device(mx.cpu)
device = torch.device("cpu")
else:
device = torch.device("mps")
dst_shapes = [
(10, 64),
(100_000, 64),
(1_000_000, 64),
(100_000,),
(200_000,),
(20_000_000,),
(10000, 64),
(100, 64),
(100, 10_000, 64),
(10, 100, 100, 21),
(1_000, 1_000, 10),
]
idx_shapes = [
[(1_000_000,)],
[(1_000_000,)],
[(100_000,)],
[(1_000_000,)],
[(20_000_000,)],
[(20_000_000,)],
[(1000000,)],
[(10000000,)],
[(1_000,)],
[(10_000,)],
[(1_000,), (1_000,)],
]
x_shapes = [
(1_000_000, 64),
(1_000_000, 64),
(100_000, 64),
(1_000_000,),
(20_000_000,),
(20_000_000,),
(1000000, 64),
(10000000, 64),
(1_000, 10_000, 64),
(10_000, 100, 100, 21),
(1_000, 10),
]
for dst_shape, x_shape, idx_shape in zip(dst_shapes, x_shapes, idx_shapes):
print("=" * 20)
print(f"Dst: {dst_shape}, X {x_shape}, Indices {idx_shape}")
benchmark_scatter_mlx(dst_shape, x_shape, idx_shape)
benchmark_scatter_torch(dst_shape, x_shape, idx_shape, device=device)


@@ -0,0 +1,189 @@
# Copyright © 2024 Apple Inc.
import argparse
import math
import os
import subprocess
import time
import mlx.core as mx
import numpy as np
device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
device_name = device_name.decode("utf-8").strip("\n")
N_warmup = 5
N_iter_bench = 40
N_iter_func = 8
def bench(f, *args):
for i in range(N_warmup):
f(*args)
s = time.perf_counter_ns()
for i in range(N_iter_bench):
f(*args)
e = time.perf_counter_ns()
return (e - s) * 1e-9
def mlx_sdpa_fused_inner(q, k, v, scale):
return mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask=None)
def mlx_sdpa_unfused_inner(q, k, v, scale, f32softmax=False):
q_dtype = q.dtype
q = q * mx.array(scale, q_dtype)
n_q_heads = q.shape[-3]
n_kv_heads = k.shape[-3]
n_repeats = n_q_heads // n_kv_heads
B = q.shape[0]
L = q.shape[2]
if n_repeats > 1:
q = mx.reshape(q, [B, n_kv_heads, n_repeats, L, -1])
k = mx.expand_dims(k, 2)
v = mx.expand_dims(v, 2)
scores = q @ mx.swapaxes(k, -1, -2)
if f32softmax:
scores = mx.softmax(scores.astype(mx.float32), axis=-1).astype(q_dtype)
else:
scores = mx.softmax(scores, axis=-1)
out = scores @ v
if n_repeats > 1:
out = mx.reshape(out, [B, n_q_heads, L, -1])
return out
def mlx_spda_unfused(q, k, v, scale, transpose):
q_out = q
if transpose:
k = mx.transpose(k, (0, 2, 1, 3))
v = mx.transpose(v, (0, 2, 1, 3))
for i in range(N_iter_func):
if transpose:
q_out = mx.transpose(q_out, (0, 2, 1, 3))
q_out = mlx_sdpa_unfused_inner(q_out, k, v, scale)
if transpose:
q_out = mx.transpose(q_out, (0, 2, 1, 3))
mx.eval(q_out)
return q_out
def mlx_spda_fused(q, k, v, scale, transpose):
q_out = q
if transpose:
k = mx.transpose(k, (0, 2, 1, 3))
v = mx.transpose(v, (0, 2, 1, 3))
for i in range(N_iter_func):
if transpose:
q_out = mx.transpose(q_out, (0, 2, 1, 3))
q_out = mlx_sdpa_fused_inner(q_out, k, v, scale)
if transpose:
q_out = mx.transpose(q_out, (0, 2, 1, 3))
mx.eval(q_out)
return q_out
def bench_shape(B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, np_dtype, transpose=True):
shape_q = (
(B, qsl, n_q_heads, head_dim) if transpose else (B, n_q_heads, qsl, head_dim)
)
shape_kv = (
(B, ksl, n_kv_heads, head_dim) if transpose else (B, n_kv_heads, ksl, head_dim)
)
q_np = np.random.normal(0.0, 1.0 / math.sqrt(head_dim), shape_q).astype(np_dtype)
k_np = np.random.normal(0.0, 1.0 / math.sqrt(head_dim), shape_kv).astype(np_dtype)
v_np = np.random.normal(0.0, 1.0 / math.sqrt(head_dim), shape_kv).astype(np_dtype)
scale = math.sqrt(1.0 / head_dim)
q_mx = mx.array(q_np)
k_mx = mx.array(k_np)
v_mx = mx.array(v_np)
time_mlx_unfused = bench(mlx_spda_unfused, q_mx, k_mx, v_mx, scale, transpose)
time_mlx_fused = bench(mlx_spda_fused, q_mx, k_mx, v_mx, scale, transpose)
if transpose:
q_mx = mx.transpose(q_mx, (0, 2, 1, 3))
k_mx = mx.transpose(k_mx, (0, 2, 1, 3))
v_mx = mx.transpose(v_mx, (0, 2, 1, 3))
o_mlx_fused = mlx_sdpa_fused_inner(q_mx, k_mx, v_mx, scale)
o_mlx_unfused = mlx_sdpa_unfused_inner(q_mx, k_mx, v_mx, scale, f32softmax=True)
atol = 1e-5 if np_dtype == np.float32 else 1e-4
if not mx.allclose(o_mlx_fused, o_mlx_unfused, atol=atol):
print(
f"Failed at (B: {B}, qsl: {qsl}, ksl: {ksl}, head_dim: {head_dim}, n_qh: {n_q_heads}, n_kvh: {n_kv_heads}) [tpose = {transpose}] with max(|a - b|) = {mx.max(mx.abs(o_mlx_unfused - o_mlx_fused)):3.2e}"
)
return time_mlx_fused, time_mlx_unfused
def get_gflop_count(B, M, N, K):
return float(2.0 * N_iter_bench * N_iter_func * B * M * N * K) / float(1024.0**3)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run gemm benchmarks")
dtypes = ("float16", "float32")[:1]
transposes = (False,)
# fmt: off
shapes_64 = (
# ( B, qsl, ksl, head_dim, n_qh, n_kvh)
( 1, 32, 32, 64, 32, 32),
( 1, 64, 64, 64, 32, 32),
( 1, 128, 128, 64, 32, 32),
( 1, 256, 256, 64, 32, 32),
( 1, 512, 512, 64, 32, 32),
( 1, 1024, 1024, 64, 32, 32),
( 1, 2048, 2048, 64, 32, 32),
( 1, 4096, 4096, 64, 32, 32),
)
shapes_80 = (
# ( B, qsl, ksl, head_dim, n_qh, n_kvh)
( 1, 1024, 1024, 80, 32, 32),
( 1, 2048, 2048, 80, 32, 32),
( 1, 4096, 4096, 80, 32, 32),
)
shapes_128 = (
# ( B, qsl, ksl, head_dim, n_qh, n_kvh)
( 1, 1024, 1024, 128, 32, 32),
( 1, 2048, 2048, 128, 32, 32),
( 1, 4096, 4096, 128, 32, 32),
)
# fmt: on
shapes = shapes_64 + shapes_80 + shapes_128
print(" B, qsl, ksl, hdim, n_qh, n_kvh, tpose, dtype, t_unfs, t_fuse, diff%")
for dtype in dtypes:
for transpose in transposes:
for B, qsl, ksl, head_dim, n_q_heads, n_kv_heads in shapes:
np_dtype = getattr(np, dtype)
time_mlx_fused, time_mlx_unfused = bench_shape(
B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, np_dtype, transpose
)
diff = time_mlx_unfused / time_mlx_fused - 1.0
t_str = 1 if transpose else 0
print(
f"{B:3d}, {qsl:5d}, {ksl:5d}, {head_dim:4d}, {n_q_heads:4d}, {n_kv_heads:5d}, {t_str:5d}, {dtype}, {time_mlx_unfused: 2.3f}, {time_mlx_fused: 2.3f}, {100. * diff:+5.2f}%"
)


@@ -0,0 +1,81 @@
import argparse
import math
import mlx.core as mx
from time_utils import time_fn
L = 16384
H = 32
H_k = H // 4
D = 128
dtype = mx.float16
loops = 10
def attention(q, k, v, mask=None):
def _sdpa(q, k, v):
B, Hq, L, D = q.shape
_, Hk, S, _ = k.shape
q = q.reshape(B, Hk, Hq // Hk, L, D)
k = k[:, :, None, :, :]
v = v[:, :, None, :, :]
s = q @ k.transpose(0, 1, 2, 4, 3)
if mask is not None:
m = mx.broadcast_to(mask, (B, Hq, L, S)).reshape(B, Hk, Hq // Hk, L, S)
s = mx.where(m, s, mx.finfo(s.dtype).min)
p = mx.softmax(s.astype(mx.float32), axis=-1).astype(s.dtype)
o = p @ v
return o.reshape(B, Hq, L, D)
for i in range(loops):
q = _sdpa(q, k, v)
return q
def sdpa(q, k, v, mask=None):
for i in range(loops):
q = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0, mask=mask)
return q
def time_self_attention_primitives():
mx.random.seed(3)
q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
v = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
mx.eval(q, k, v)
time_fn(attention, q, k, v)
def time_self_attention_sdpa():
mx.random.seed(3)
q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
v = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
mx.eval(q, k, v)
time_fn(sdpa, q, k, v)
def time_self_attention_sdpa_with_mask():
mx.random.seed(3)
q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
v = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
mask = mx.full((L,), True)
mask[L // 2 :] = False
mx.eval(q, k, v, mask)
def sdpa_mask(*args):
return sdpa(*args, mask=mask)
def attention_mask(*args):
return attention(*args, mask=mask)
time_fn(attention_mask, q, k, v)
time_fn(sdpa_mask, q, k, v)
if __name__ == "__main__":
time_self_attention_sdpa()
time_self_attention_primitives()
time_self_attention_sdpa_with_mask()


@@ -1,8 +1,8 @@
# Copyright © 2023 Apple Inc.
import argparse
import mlx.core as mx
import mlx.core as mx
from time_utils import time_fn
@@ -44,6 +44,13 @@ def time_matmul():
time_fn(mx.matmul, a, b)
def time_maximum():
a = mx.random.uniform(shape=(32, 1024, 1024))
b = mx.random.uniform(shape=(32, 1024, 1024))
mx.eval(a, b)
time_fn(mx.maximum, a, b)
def time_negative():
a = mx.random.uniform(shape=(10000, 1000))
mx.eval(a)
@@ -101,6 +108,7 @@ if __name__ == "__main__":
time_add()
time_matmul()
time_maximum()
time_exp()
time_negative()
time_logsumexp()


@@ -1,4 +1,4 @@
# Copyright © 2023 Apple Inc.
# Copyright © 2023-2024 Apple Inc.
import time
@@ -6,7 +6,11 @@ import mlx.core as mx
def time_fn(fn, *args, **kwargs):
print(f"Timing {fn.__name__} ...", end=" ")
msg = kwargs.pop("msg", None)
if msg:
print(f"Timing {msg} ...", end=" ")
else:
print(f"Timing {fn.__name__} ...", end=" ")
# warmup
for _ in range(5):
@@ -20,3 +24,15 @@ def time_fn(fn, *args, **kwargs):
msec = 1e3 * (toc - tic) / num_iters
print(f"{msec:.5f} msec")
def measure_runtime(fn, **kwargs):
# Warmup
for _ in range(5):
fn(**kwargs)
tic = time.time()
iters = 100
for _ in range(iters):
fn(**kwargs)
return (time.time() - tic) * 1000 / iters


@@ -1,56 +1,41 @@
include(CMakeParseArguments)
###############################################################################
# ##############################################################################
# Build metal library
#
# Adds a custom target ${TARGET} to build ${OUTPUT_DIRECTORY}/{TITLE}.metallib
# from list ${SOURCES}, including list ${INCLUDE_DIRS}, depends on list ${DEPS}
#
# Args:
# TARGET: Custom target to be added for the metal library
# TITLE: Name of the .metallib
# OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib
# SOURCES: List of source files
# INCLUDE_DIRS: List of include dirs
# DEPS: List of depedency files (like headers)
# Args: TARGET: Custom target to be added for the metal library TITLE: Name of
# the .metallib OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib SOURCES: List
# of source files INCLUDE_DIRS: List of include dirs DEPS: List of dependency
# files (like headers)
#
macro(mlx_build_metallib)
# Parse args
set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY)
set(multiValueArgs SOURCES INCLUDE_DIRS DEPS)
cmake_parse_arguments(
MTLLIB
""
"${oneValueArgs}"
"${multiValueArgs}"
${ARGN}
)
cmake_parse_arguments(MTLLIB "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
# Set output
set(MTLLIB_BUILD_TARGET "${MTLLIB_OUTPUT_DIRECTORY}/${MTLLIB_TITLE}.metallib")
# Collect compile options
# Collect compile options
set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math)
# Prepare metllib build command
# Prepare metallib build command
add_custom_command(
OUTPUT ${MTLLIB_BUILD_TARGET}
COMMAND xcrun -sdk macosx metal
"$<LIST:TRANSFORM,${MTLLIB_INCLUDE_DIRS},PREPEND,-I>"
${MTLLIB_COMPILE_OPTIONS}
${MTLLIB_SOURCES}
-o ${MTLLIB_BUILD_TARGET}
COMMAND
xcrun -sdk macosx metal
"$<LIST:TRANSFORM,${MTLLIB_INCLUDE_DIRS},PREPEND,-I>"
${MTLLIB_COMPILE_OPTIONS} ${MTLLIB_SOURCES} -o ${MTLLIB_BUILD_TARGET}
DEPENDS ${MTLLIB_DEPS} ${MTLLIB_SOURCES}
COMMAND_EXPAND_LISTS
COMMENT "Building ${MTLLIB_TITLE}.metallib"
VERBATIM
)
VERBATIM)
# Add metallib custom target
add_custom_target(
${MTLLIB_TARGET}
DEPENDS
${MTLLIB_BUILD_TARGET}
)
add_custom_target(${MTLLIB_TARGET} DEPENDS ${MTLLIB_BUILD_TARGET})
endmacro(mlx_build_metallib)
endmacro(mlx_build_metallib)

docs/.gitignore

@@ -1 +1,3 @@
src/python/_autosummary*/
src/python/nn/_autosummary*/
src/python/optimizers/_autosummary*/

docs/Doxyfile

@@ -0,0 +1,50 @@
################################################################################
# Primary project setup. #
################################################################################
PROJECT_NAME = "MLX"
OUTPUT_DIRECTORY = build
XML_OUTPUT = xml
HTML_OUTPUT = html
STRIP_FROM_PATH = ../
INPUT = ../mlx
FILE_PATTERNS = *.h
EXCLUDE_PATTERNS = */private/*
CREATE_SUBDIRS = NO
FULL_PATH_NAMES = YES
RECURSIVE = YES
GENERATE_HTML = YES
GENERATE_LATEX = NO
GENERATE_XML = YES
XML_PROGRAMLISTING = YES
################################################################################
# Doxygen preprocessor / parser control. #
################################################################################
ENABLE_PREPROCESSING = YES
MACRO_EXPANSION = YES
EXPAND_ONLY_PREDEF = NO
SKIP_FUNCTION_MACROS = NO
################################################################################
# Compound extraction control. #
################################################################################
EXTRACT_ALL = YES
EXTRACT_PACKAGE = YES
EXTRACT_STATIC = YES
CASE_SENSE_NAMES = NO
################################################################################
# Docstring control / customization. #
################################################################################
JAVADOC_AUTOBRIEF = YES
################################################################################
# Warning suppression. #
################################################################################
QUIET = YES
WARN_IF_UNDOCUMENTED = NO


@@ -2,12 +2,16 @@
### Setup (do once)
Install [sphinx](https://www.sphinx-doc.org/en/master/usage/installation.html)
for example with `conda`:
Install Doxygen:
```
conda install sphinx
pip install sphinx-book-theme
brew install doxygen
```
Install Python packages:
```
pip install -r requirements.txt
```
### Build
@@ -15,7 +19,7 @@ pip install sphinx-book-theme
Build the docs from `mlx/docs/`
```
make html
doxygen && make html
```
View the docs by running a server in `mlx/docs/build/html/`:
@@ -26,7 +30,7 @@ python -m http.server <port>
and point your browser to `http://localhost:<port>`.
### Push to Github Pages
### Push to GitHub Pages
Check-out the `gh-pages` branch (`git switch gh-pages`) and build
the docs. Then force add the `build/html` directory:

docs/requirements.txt

@@ -0,0 +1,4 @@
sphinx
breathe
sphinx-book-theme
mlx

Binary file not shown (new image, 1.2 MiB).

Binary file not shown (new image, 746 KiB).

Binary file not shown (image updated: 7.2 KiB before, 76 KiB after).

Binary file not shown (new image, 48 KiB).


@@ -0,0 +1,33 @@
{{ fullname | escape | underline}}
.. currentmodule:: {{ module }}
.. add toctree option to make autodoc generate the pages
.. autoclass:: {{ objname }}
{% block attributes %}
{% if attributes %}
.. rubric:: Attributes
.. autosummary::
:toctree: .
{% for item in attributes %}
~{{ fullname }}.{{ item }}
{%- endfor %}
{% endif %}
{% endblock %}
{% block methods %}
{% if methods %}
.. rubric:: Methods
.. autosummary::
:toctree: .
{% for item in methods %}
{%- if item not in inherited_members and item != '__init__' %}
~{{ fullname }}.{{ item }}
{%- endif -%}
{%- endfor %}
{% endif %}
{% endblock %}


@@ -4,16 +4,17 @@
.. autoclass:: {{ objname }}
{#{% block methods %}
{% block methods %}
{% if methods %}
.. rubric:: {{ _('Methods') }}
.. autosummary::
{% for item in methods %}
{%- if item not in inherited_members and item != '__init__' %}
{%- if item not in inherited_members and item != "__init__" %}
~{{ name }}.{{ item }}
{%- endif %}
{%- endfor %}
{% endif %}
{% endblock %}#}
{% endblock %}


@@ -5,13 +5,15 @@
import os
import subprocess
import mlx.core as mx
# -- Project information -----------------------------------------------------
project = "MLX"
copyright = "2023, MLX Contributors"
author = "MLX Contributors"
version = "0.0.4"
release = "0.0.4"
version = ".".join(mx.__version__.split(".")[:3])
release = version
# -- General configuration ---------------------------------------------------
@@ -20,22 +22,28 @@ extensions = [
"sphinx.ext.autosummary",
"sphinx.ext.intersphinx",
"sphinx.ext.napoleon",
"breathe",
]
python_use_unqualified_type_names = True
autosummary_generate = True
autosummary_filename_map = {"mlx.core.Stream": "stream_class"}
intersphinx_mapping = {
"https://docs.python.org/3": None,
"https://numpy.org/doc/stable/": None,
"python": ("https://docs.python.org/3", None),
"numpy": ("https://numpy.org/doc/stable/", None),
}
breathe_projects = {"mlx": "../build/xml"}
breathe_default_project = "mlx"
templates_path = ["_templates"]
html_static_path = ["_static"]
source_suffix = ".rst"
master_doc = "index"
main_doc = "index"
highlight_language = "python"
pygments_style = "sphinx"
add_module_names = False
# -- Options for HTML output -------------------------------------------------
@@ -46,11 +54,45 @@ html_theme_options = {
"repository_url": "https://github.com/ml-explore/mlx",
"use_repository_button": True,
"navigation_with_keys": False,
"logo": {
"image_light": "_static/mlx_logo.png",
"image_dark": "_static/mlx_logo_dark.png",
},
}
html_logo = "_static/mlx_logo.png"
html_favicon = html_theme_options["logo"]["image_light"]
# -- Options for HTMLHelp output ---------------------------------------------
htmlhelp_basename = "mlx_doc"
def setup(app):
from sphinx.util import inspect
wrapped_isfunc = inspect.isfunction
def isfunc(obj):
type_name = str(type(obj))
if "nanobind.nb_method" in type_name or "nanobind.nb_func" in type_name:
return True
return wrapped_isfunc(obj)
inspect.isfunction = isfunc
# -- Options for LaTeX output ------------------------------------------------
latex_documents = [(main_doc, "MLX.tex", "MLX Documentation", author, "manual")]
latex_elements = {
"preamble": r"""
\usepackage{enumitem}
\setlistdepth{5}
\setlist[itemize,1]{label=$\bullet$}
\setlist[itemize,2]{label=$\bullet$}
\setlist[itemize,3]{label=$\bullet$}
\setlist[itemize,4]{label=$\bullet$}
\setlist[itemize,5]{label=$\bullet$}
\renewlist{itemize}{itemize}{5}
""",
}


@@ -3,4 +3,5 @@
Operations
==========
.. doxygengroup:: ops
:content-only:


@@ -0,0 +1,427 @@
.. _custom_metal_kernels:
Custom Metal Kernels
====================
MLX supports writing custom Metal kernels through the Python and C++ APIs.
Simple Example
--------------
Let's write a custom kernel that computes ``exp`` elementwise:
.. code-block:: python
def exp_elementwise(a: mx.array):
source = """
uint elem = thread_position_in_grid.x;
T tmp = inp[elem];
out[elem] = metal::exp(tmp);
"""
kernel = mx.fast.metal_kernel(
name="myexp",
input_names=["inp"],
output_names=["out"],
source=source,
)
outputs = kernel(
inputs=[a],
template=[("T", mx.float32)],
grid=(a.size, 1, 1),
threadgroup=(256, 1, 1),
output_shapes=[a.shape],
output_dtypes=[a.dtype],
)
return outputs[0]
a = mx.random.normal(shape=(4, 16)).astype(mx.float16)
b = exp_elementwise(a)
assert mx.allclose(b, mx.exp(a))
.. note::
We are only required to pass the body of the Metal kernel in ``source``.
The full function signature will be generated using:
* The shapes/dtypes of ``inputs``
In the above, ``a`` is an ``mx.array`` of type ``mx.float16`` and we pass it with the key ``inp``
so we will add ``const device float16_t* inp`` to the signature.
``inp_shape``, ``inp_strides`` and ``inp_ndim`` are also added for convenience if they are present
in ``source``.
* The list of ``output_dtypes``
In the above, ``out`` is an ``mx.array`` of type ``mx.float16``
so we add ``device float16_t* out``.
* Template parameters passed using ``template``
In the above, ``template=[("T", mx.float32)]`` adds a template of ``template <typename T>`` to the function
and instantiates the template with ``custom_kernel_myexp_float<float>``.
Template parameters can be ``mx.core.Dtype``, ``int`` or ``bool``.
* Metal attributes used in ``source`` such as ``[[thread_position_in_grid]]``
These will be added as function arguments.
All the attributes defined in Table 5.8 of the `Metal Shading Language Specification <https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf>`_ are supported.
Putting this all together, the generated function signature for ``myexp`` is as follows:
.. code-block:: cpp
template <typename T>
[[kernel]] void custom_kernel_myexp_float(
const device float16_t* inp [[buffer(0)]],
device float16_t* out [[buffer(1)]],
uint3 thread_position_in_grid [[thread_position_in_grid]]) {
uint elem = thread_position_in_grid.x;
T tmp = inp[elem];
out[elem] = metal::exp(tmp);
}
template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;
Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads <https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_ function.
This means we will launch ``mx.prod(grid)`` threads, subdivided into ``threadgroup`` size threadgroups.
For optimal performance, each thread group dimension should be less than or equal to the corresponding grid dimension.
Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the generated code for debugging purposes.
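To make the dispatch arithmetic concrete, here is a minimal sketch (reusing the ``kernel`` and array ``a`` defined above and using only the documented arguments) that launches ``a.size`` threads in threadgroups of 256 and prints the generated source:

.. code-block:: python

    # Same dispatch as the example above, with verbose=True so the
    # generated Metal source is printed. `kernel` and `a` are defined earlier.
    outputs = kernel(
        inputs=[a],
        template=[("T", mx.float32)],
        grid=(a.size, 1, 1),       # mx.prod(grid) threads are launched
        threadgroup=(256, 1, 1),   # threads per threadgroup
        output_shapes=[a.shape],
        output_dtypes=[a.dtype],
        verbose=True,              # print the generated kernel source
    )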
Using Shape/Strides
-------------------
``mx.fast.metal_kernel`` supports an argument ``ensure_row_contiguous`` which is ``True`` by default.
This will copy the ``mx.array`` inputs if needed before the kernel is launched to ensure that the memory layout is row contiguous.
Generally this makes writing the kernel easier, since we don't have to worry about gaps or the ordering of the dims
when indexing.
If we want to avoid this copy, ``metal_kernel`` automatically passes ``a_shape``, ``a_strides`` and ``a_ndim`` for each
input array ``a`` if any are present in ``source``.
We can then use MLX's built in indexing utils to fetch the right elements for each thread.
Let's convert ``myexp`` above to support arbitrarily strided arrays without relying on a copy from ``ensure_row_contiguous``:
.. code-block:: python
def exp_elementwise(a: mx.array):
source = """
uint elem = thread_position_in_grid.x;
// Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
T tmp = inp[loc];
// Output arrays are always row contiguous
out[elem] = metal::exp(tmp);
"""
kernel = mx.fast.metal_kernel(
name="myexp_strided",
input_names=["inp"],
output_names=["out"],
source=source
)
outputs = kernel(
inputs=[a],
template=[("T", mx.float32)],
grid=(a.size, 1, 1),
threadgroup=(256, 1, 1),
output_shapes=[a.shape],
output_dtypes=[a.dtype],
ensure_row_contiguous=False,
)
return outputs[0]
a = mx.random.normal(shape=(4, 16)).astype(mx.float16)
# make non-contiguous
a = a[::2]
b = exp_elementwise(a)
assert mx.allclose(b, mx.exp(a))
Complex Example
-----------------------------
Let's implement a more complex example: ``grid_sample`` in ``"bilinear"`` mode.
We'll start with the following MLX implementation using standard ops:
.. code-block:: python
def grid_sample_ref(x, grid):
N, H_in, W_in, _ = x.shape
ix = ((grid[..., 0] + 1) * W_in - 1) / 2
iy = ((grid[..., 1] + 1) * H_in - 1) / 2
ix_nw = mx.floor(ix).astype(mx.int32)
iy_nw = mx.floor(iy).astype(mx.int32)
ix_ne = ix_nw + 1
iy_ne = iy_nw
ix_sw = ix_nw
iy_sw = iy_nw + 1
ix_se = ix_nw + 1
iy_se = iy_nw + 1
nw = (ix_se - ix) * (iy_se - iy)
ne = (ix - ix_sw) * (iy_sw - iy)
sw = (ix_ne - ix) * (iy - iy_ne)
se = (ix - ix_nw) * (iy - iy_nw)
I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]
mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)
I_nw *= mask_nw[..., None]
I_ne *= mask_ne[..., None]
I_sw *= mask_sw[..., None]
I_se *= mask_se[..., None]
output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se
return output
Now let's use ``mx.custom_function`` together with ``mx.fast.metal_kernel``
to write a fast GPU kernel for both the forward and backward passes.
First we'll implement the forward pass as a fused kernel:
.. code-block:: python
@mx.custom_function
def grid_sample(x, grid):
assert x.ndim == 4, "`x` must be 4D."
assert grid.ndim == 4, "`grid` must be 4D."
B, _, _, C = x.shape
_, gN, gM, D = grid.shape
out_shape = (B, gN, gM, C)
assert D == 2, "Last dim of `grid` must be size 2."
source = """
uint elem = thread_position_in_grid.x;
int H = x_shape[1];
int W = x_shape[2];
int C = x_shape[3];
int gH = grid_shape[1];
int gW = grid_shape[2];
int w_stride = C;
int h_stride = W * w_stride;
int b_stride = H * h_stride;
uint grid_idx = elem / C * 2;
float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
int ix_nw = floor(ix);
int iy_nw = floor(iy);
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
int batch_idx = elem / C / gH / gW * b_stride;
int channel_idx = elem % C;
int base_idx = batch_idx + channel_idx;
T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];
I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;
out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
"""
kernel = mx.fast.metal_kernel(
name="grid_sample",
input_names=["x", "grid"],
output_names=["out"],
source=source,
)
outputs = kernel(
inputs=[x, grid],
template=[("T", x.dtype)],
output_shapes=[out_shape],
output_dtypes=[x.dtype],
grid=(np.prod(out_shape), 1, 1),
threadgroup=(256, 1, 1),
)
return outputs[0]
For a reasonably sized input such as:
.. code-block:: python
x.shape = (8, 1024, 1024, 64)
grid.shape = (8, 256, 256, 2)
On an M1 Max, we see a big performance improvement:
``55.7ms -> 6.7ms => 8x speed up``
Grid Sample VJP
---------------
Since we decorated ``grid_sample`` with ``mx.custom_function``, we can now define
its custom vjp transform so MLX can differentiate it.
The backwards pass requires atomically updating ``x_grad``/``grid_grad`` and so
requires a few extra ``mx.fast.metal_kernel`` features:
* ``init_value=0``
Initialize all of the kernel's outputs to this value before it runs. This allows us to update only part of the output arrays with the kernel.
* ``atomic_outputs=True``
Designate all of the kernel outputs as ``atomic`` in the function signature.
This means we can use Metal's ``atomic`` features to simultaneously update the ``x_grad`` and ``grid_grad`` arrays from multiple threadgroups.
See section 6.15 of the `Metal Shading Language Specification <https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf>`_ for more details.
We can then implement the backwards pass as follows:
.. code-block:: python
@grid_sample.vjp
def grid_sample_vjp(primals, cotangent, _):
x, grid = primals
B, _, _, C = x.shape
_, gN, gM, D = grid.shape
assert D == 2, "Last dim of `grid` must be size 2."
source = """
uint elem = thread_position_in_grid.x;
int H = x_shape[1];
int W = x_shape[2];
int C = x_shape[3];
// Pad C to the nearest larger simdgroup size multiple
int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;
int gH = grid_shape[1];
int gW = grid_shape[2];
int w_stride = C;
int h_stride = W * w_stride;
int b_stride = H * h_stride;
uint grid_idx = elem / C_padded * 2;
float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;
int ix_nw = floor(ix);
int iy_nw = floor(iy);
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
int batch_idx = elem / C_padded / gH / gW * b_stride;
int channel_idx = elem % C_padded;
int base_idx = batch_idx + channel_idx;
T gix = T(0);
T giy = T(0);
if (channel_idx < C) {
int cot_index = elem / C_padded * C + channel_idx;
T cot = cotangent[cot_index];
if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);
T I_nw = x[offset];
gix -= I_nw * (iy_se - iy) * cot;
giy -= I_nw * (ix_se - ix) * cot;
}
if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);
T I_ne = x[offset];
gix += I_ne * (iy_sw - iy) * cot;
giy -= I_ne * (ix - ix_sw) * cot;
}
if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);
T I_sw = x[offset];
gix -= I_sw * (iy - iy_ne) * cot;
giy += I_sw * (ix_ne - ix) * cot;
}
if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);
T I_se = x[offset];
gix += I_se * (iy - iy_nw) * cot;
giy += I_se * (ix - ix_nw) * cot;
}
}
T gix_mult = W / 2;
T giy_mult = H / 2;
// Reduce across each simdgroup first.
// This is much faster than relying purely on atomics.
gix = simd_sum(gix);
giy = simd_sum(giy);
if (thread_index_in_simdgroup == 0) {
atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
}
"""
kernel = mx.fast.metal_kernel(
name="grid_sample_grad",
input_names=["x", "grid", "cotangent"],
output_names=["x_grad", "grid_grad"],
source=source,
atomic_outputs=True,
)
# pad the output channels to simd group size
# so that our `simd_sum`s don't overlap.
simdgroup_size = 32
C_padded = (C + simdgroup_size - 1) // simdgroup_size * simdgroup_size
grid_size = B * gN * gM * C_padded
outputs = kernel(
inputs=[x, grid, cotangent],
template=[("T", x.dtype)],
output_shapes=[x.shape, grid.shape],
output_dtypes=[x.dtype, x.dtype],
grid=(grid_size, 1, 1),
threadgroup=(256, 1, 1),
init_value=0,
)
return outputs[0], outputs[1]
There's an even larger speed up for the vjp:
``676.4ms -> 16.7ms => 40x speed up``


@@ -1,24 +1,16 @@
Developer Documentation
=======================
Custom Extensions in MLX
========================
MLX provides a open and flexible backend to which users may add operations
and specialized implementations without much hassle. While the library supplies
efficient operations that can be used and composed for any number of
applications, there may arise cases where new functionalities or highly
optimized implementations are needed. For such cases, you may design and
implement your own operations that link to and build on top of :mod:`mlx.core`.
We will introduce the inner-workings of MLX and go over a simple example to
learn the steps involved in adding new operations to MLX with your own CPU
and GPU implementations.
You can extend MLX with custom operations on the CPU or GPU. This guide
explains how to do that with a simple example.
Introducing the Example
-----------------------
Let's say that you would like an operation that takes in two arrays,
``x`` and ``y``, scales them both by some coefficents ``alpha`` and ``beta``
respectively, and then adds them together to get the result
``z = alpha * x + beta * y``. Well, you can very easily do that by just
writing out a function as follows:
Let's say you would like an operation that takes in two arrays, ``x`` and
``y``, scales them both by coefficients ``alpha`` and ``beta`` respectively,
and then adds them together to get the result ``z = alpha * x + beta * y``.
You can do that in MLX directly:
.. code-block:: python
@@ -27,49 +19,40 @@ writing out a function as follows:
def simple_axpby(x: mx.array, y: mx.array, alpha: float, beta: float) -> mx.array:
return alpha * x + beta * y
This function performs that operation while leaving the implementations and
differentiation to MLX.
This function performs that operation while leaving the implementation and
function transformations to MLX.
However, you work with vector math libraries often and realize that the
``axpby`` routine defines the same operation ``Y = (alpha * X) + (beta * Y)``.
You would really like the part of your applications that does this operation
on the CPU to be very fast - so you decide that you want it to rely on the
``axpby`` routine provided by the Accelerate_ framework. Continuing to impose
our assumptions on to you, let's also assume that you want to learn how add
your own implementation for the gradients of your new operation while going
over the ins-and-outs of the MLX framework.
However, you may need to customize the underlying implementation, perhaps to
make it faster or for custom differentiation. In this tutorial we will go
through adding custom extensions. It will cover:
Well, what a coincidence! You are in the right place. Over the course of this
example, we will learn:
* The structure of the MLX library from the frontend API to the backend implementations.
* How to implement your own CPU backend that redirects to Accelerate_ when appropriate (and a fallback if needed).
* How to implement your own GPU implementation using metal.
* How to add your own ``vjp`` and ``jvp``.
* How to build your implementations, link them to MLX, and bind them to python.
* The structure of the MLX library.
* Implementing a CPU operation that redirects to Accelerate_ when appropriate.
* Implementing a GPU operation using metal.
* Adding the ``vjp`` and ``jvp`` function transformation.
* Building a custom extension and binding it to python.
Operations and Primitives
-------------------------
In one sentence, operations in MLX build the computation graph, and primitives
provide the rules for evaluation and transformations of said graph. Let's start
by discussing operations in more detail.
Operations in MLX build the computation graph. Primitives provide the rules for
evaluating and transforming the graph. Let's start by discussing operations in
more detail.
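As a minimal sketch of that split, calling operations only records nodes in the graph; the primitives behind them run when the result is evaluated:

.. code-block:: python

    import mlx.core as mx

    x = mx.ones((2, 2))
    y = 2 * x + 1  # operations build the computation graph; nothing runs yet
    mx.eval(y)     # primitives are evaluated (on CPU or GPU) at this point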
Operations
^^^^^^^^^^^
Operations are the frontend functions that operate on arrays. They are defined
in the C++ API (:ref:`cpp_ops`) and then we provide bindings to these
operations in the Python API (:ref:`ops`).
Operations are the front-end functions that operate on arrays. They are defined
in the C++ API (:ref:`cpp_ops`), and the Python API (:ref:`ops`) binds them.
We would like an operation, :meth:`axpby` that takes in two arrays ``x`` and ``y``,
and two scalars, ``alpha`` and ``beta``. This is how we would define it in the
C++ API:
We would like an operation, :meth:`axpby` that takes in two arrays ``x`` and
``y``, and two scalars, ``alpha`` and ``beta``. This is how to define it in
C++:
.. code-block:: C++
/**
* Scale and sum two vectors elementwise
* Scale and sum two vectors element-wise
* z = alpha * x + beta * y
*
* Follow numpy style broadcasting between x and y
@@ -83,10 +66,7 @@ C++ API:
StreamOrDevice s = {} // Stream on which to schedule the operation
);
This operation itself can call other operations within it if needed. So, the
simplest way to go about implementing this operation would be do so in terms
of existing operations.
The simplest way to implement this operation is in terms of existing operations:
.. code-block:: C++
@@ -100,25 +80,23 @@ of existing operations.
// Scale x and y on the provided stream
auto ax = multiply(array(alpha), x, s);
auto by = multiply(array(beta), y, s);
// Add and return
return add(ax, by, s);
}
However, as we discussed earlier, this is not our goal. The operations themselves
do not contain the implementations that act on the data, nor do they contain the
rules of transformations. Rather, they are an easy to use interface that build
on top of the building blocks we call :class:`Primitive`.
The operations themselves do not contain the implementations that act on the
data, nor do they contain the rules of transformations. Rather, they are an
easy to use interface that uses :class:`Primitive` building blocks.
Primitives
^^^^^^^^^^^
A :class:`Primitive` is part of the computation graph of an :class:`array`. It
defines how to create an output given a set of input :class:`array` . Further,
a :class:`Primitive` is a class that contains rules on how it is evaluated
on the CPU or GPU, and how it acts under transformations such as ``vjp`` and
``jvp``. These words on their own can be a bit abstract, so lets take a step
back and go to our example to give ourselves a more concrete image.
A :class:`Primitive` is part of the computation graph of an :class:`array`. It
defines how to create output arrays given input arrays. Further, a
:class:`Primitive` has methods to run on the CPU or GPU and for function
transformations such as ``vjp`` and ``jvp``. Let's go back to our example to be
more concrete:
.. code-block:: C++
@@ -134,11 +112,15 @@ back and go to our example to give ourselves a more concrete image.
* To avoid unnecessary allocations, the evaluation function
* is responsible for allocating space for the array.
*/
void eval_cpu(const std::vector<array>& inputs, array& out) override;
void eval_gpu(const std::vector<array>& inputs, array& out) override;
void eval_cpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) override;
void eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) override;
/** The Jacobian-vector product. */
array jvp(
std::vector<array> jvp(
const std::vector<array>& primals,
const std::vector<array>& tangents,
const std::vector<int>& argnums) override;
@@ -147,15 +129,16 @@ back and go to our example to give ourselves a more concrete image.
std::vector<array> vjp(
const std::vector<array>& primals,
const array& cotan,
const std::vector<int>& argnums) override;
const std::vector<int>& argnums,
const std::vector<array>& outputs) override;
/**
* The primitive must know how to vectorize itself accross
* The primitive must know how to vectorize itself across
* the given axes. The output is a pair containing the array
* representing the vectorized computation and the axis which
* corresponds to the output vectorized dimension.
*/
std::pair<array, int> vmap(
virtual std::pair<std::vector<array>, std::vector<int>> vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) override;
@@ -175,22 +158,22 @@ back and go to our example to give ourselves a more concrete image.
void eval(const std::vector<array>& inputs, array& out);
};
The :class:`Axpby` class derives from the base :class:`Primitive` class and
follows the above demonstrated interface. :class:`Axpby` treats ``alpha`` and
``beta`` as parameters. It then provides implementations of how the array ``out``
is produced given ``inputs`` through :meth:`Axpby::eval_cpu` and
:meth:`Axpby::eval_gpu`. Further, it provides rules of transformations in
:meth:`Axpby::jvp`, :meth:`Axpby::vjp`, and :meth:`Axpby::vmap`.
The :class:`Axpby` class derives from the base :class:`Primitive` class. The
:class:`Axpby` treats ``alpha`` and ``beta`` as parameters. It then provides
implementations of how the output array is produced given the inputs through
:meth:`Axpby::eval_cpu` and :meth:`Axpby::eval_gpu`. It also provides rules
of transformations in :meth:`Axpby::jvp`, :meth:`Axpby::vjp`, and
:meth:`Axpby::vmap`.
Using the Primitives
^^^^^^^^^^^^^^^^^^^^^
Using the Primitive
^^^^^^^^^^^^^^^^^^^
Operations can use this :class:`Primitive` to add a new :class:`array` to
the computation graph. An :class:`array` can be constructed by providing its
data type, shape, the :class:`Primitive` that computes it, and the
:class:`array` inputs that are passed to the primitive.
Operations can use this :class:`Primitive` to add a new :class:`array` to the
computation graph. An :class:`array` can be constructed by providing its data
type, shape, the :class:`Primitive` that computes it, and the :class:`array`
inputs that are passed to the primitive.
Let's re-implement our operation now in terms of our :class:`Axpby` primitive.
Let's reimplement our operation now in terms of our :class:`Axpby` primitive.
.. code-block:: C++
@@ -223,14 +206,14 @@ Let's re-implement our operation now in terms of our :class:`Axpby` primitive.
/* const std::vector<int>& shape = */ out_shape,
/* Dtype dtype = */ out_dtype,
/* std::unique_ptr<Primitive> primitive = */
std::make_unique<Axpby>(to_stream(s), alpha, beta),
std::make_shared<Axpby>(to_stream(s), alpha, beta),
/* const std::vector<array>& inputs = */ broadcasted_inputs);
}
This operation now handles the following:
#. Upcast inputs and resolve the the output data type.
#. Upcast inputs and resolve the output data type.
#. Broadcast the inputs and resolve the output shape.
#. Construct the primitive :class:`Axpby` using the given stream, ``alpha``, and ``beta``.
#. Construct the output :class:`array` using the primitive and the inputs.
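From the user's perspective, the result behaves like any other MLX operation. The following is a hypothetical usage sketch, assuming the extension has been built and is importable as ``mlx_sample_extensions`` (the package and binding names are illustrative, not prescribed by this guide):

.. code-block:: python

    import mlx.core as mx
    from mlx_sample_extensions import axpby  # hypothetical module name

    x = mx.ones((3, 4), dtype=mx.float32)
    y = mx.ones((4,), dtype=mx.float16)    # broadcast against x, upcast to float32
    z = axpby(x, y, alpha=2.0, beta=0.5)   # z = 2.0 * x + 0.5 * y
    print(z.shape, z.dtype)                # shape (3, 4), dtype float32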
@@ -238,27 +221,26 @@ This operation now handles the following:
Implementing the Primitive
--------------------------
No computation happens when we call the operation alone. In effect, the
operation only builds the computation graph. When we evaluate the output
array, MLX schedules the execution of the computation graph, and calls
:meth:`Axpby::eval_cpu` or :meth:`Axpby::eval_gpu` depending on the
stream/device specified by the user.
No computation happens when we call the operation alone. The operation only
builds the computation graph. When we evaluate the output array, MLX schedules
the execution of the computation graph, and calls :meth:`Axpby::eval_cpu` or
:meth:`Axpby::eval_gpu` depending on the stream/device specified by the user.
.. warning::
When :meth:`Primitive::eval_cpu` or :meth:`Primitive::eval_gpu` are called,
no memory has been allocated for the output array. It falls on the implementation
of these functions to allocate memory as needed
of these functions to allocate memory as needed.
Implementing the CPU Backend
Implementing the CPU Back-end
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Let's start by trying to implement a naive and generic version of
:meth:`Axpby::eval_cpu`. We declared this as a private member function of
:class:`Axpby` earlier called :meth:`Axpby::eval`.
Let's start by implementing a naive and generic version of
:meth:`Axpby::eval_cpu`. We declared this as a private member function of
:class:`Axpby` earlier called :meth:`Axpby::eval`.
Our naive method will go over each element of the output array, find the
corresponding input elements of ``x`` and ``y`` and perform the operation
pointwise. This is captured in the templated function :meth:`axpby_impl`.
Our naive method will go over each element of the output array, find the
corresponding input elements of ``x`` and ``y`` and perform the operation
point-wise. This is captured in the templated function :meth:`axpby_impl`.
.. code-block:: C++
@@ -284,31 +266,31 @@ pointwise. This is captured in the templated function :meth:`axpby_impl`.
T alpha = static_cast<T>(alpha_);
T beta = static_cast<T>(beta_);
// Do the elementwise operation for each output
// Do the element-wise operation for each output
for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
// Map linear indices to offsets in x and y
auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());
// We allocate the output to be contiguous and regularly strided
// (defaults to row major) and hence it doesn't need additonal mapping
// (defaults to row major) and hence it doesn't need additional mapping
out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
}
}
Now, we would like our implementation to be able to do this pointwise operation
for all incoming floating point arrays. Accordingly, we add dispatches for
``float32``, ``float16``, ``bfloat16`` and ``complex64``. We throw an error
if we encounter an unexpected type.
Our implementation should work for all incoming floating point arrays.
Accordingly, we add dispatches for ``float32``, ``float16``, ``bfloat16`` and
``complex64``. We throw an error if we encounter an unexpected type.
.. code-block:: C++
/** Fall back implementation for evaluation on CPU */
void Axpby::eval(const std::vector<array>& inputs, array& out) {
// Check the inputs (registered in the op while contructing the out array)
assert(inputs.size() == 2);
void Axpby::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
auto& x = inputs[0];
auto& y = inputs[1];
auto& out = outputs[0];
// Dispatch to the correct dtype
if (out.dtype() == float32) {
@@ -321,28 +303,26 @@ if we encounter an unexpected type.
return axpby_impl<complex64_t>(x, y, out, alpha_, beta_);
} else {
throw std::runtime_error(
"Axpby is only supported for floating point types.");
"[Axpby] Only supports floating point types.");
}
}
We have a fallback implementation! Now, to do what we are really here to do.
Remember we wanted to use the ``axpby`` routine provided by the Accelerate_
framework? Well, there are 3 complications to keep in mind:
This is good as a fallback implementation. We can use the ``axpby`` routine
provided by the Accelerate_ framework for a faster implementation in certain
cases:
#. Accelerate does not provide implementations of ``axpby`` for half precision
floats. We can only direct to it for ``float32`` types
#. Accelerate assumes the inputs ``x`` and ``y`` are contiguous and all elements
have fixed strides between them. Possibly due to broadcasts and transposes,
we aren't guaranteed that the inputs fit this requirement. We can
only direct to Accelerate if both ``x`` and ``y`` are row contiguous or
column contiguous.
#. Accelerate performs the routine ``Y = (alpha * X) + (beta * Y)`` inplace.
MLX expects to write out the answer to a new array. We must copy the elements
of ``y`` into the output array and use that as an input to ``axpby``
floats. We can only use it for ``float32`` types.
#. Accelerate assumes the inputs ``x`` and ``y`` are contiguous and all
elements have fixed strides between them. We only direct to Accelerate
if both ``x`` and ``y`` are row contiguous or column contiguous.
#. Accelerate performs the routine ``Y = (alpha * X) + (beta * Y)`` in-place.
MLX expects to write the output to a new array. We must copy the elements
of ``y`` into the output and use that as an input to ``axpby``.
Let's write out an implementation that uses Accelerate in the right conditions.
It must simply allocate data for the output, copy elements of ``y`` into it,
and then call the :meth:`catlas_saxpby` from accelerate.
Let's write an implementation that uses Accelerate in the right conditions.
It allocates data for the output, copies ``y`` into it, and then calls the
:func:`catlas_saxpby` from accelerate.
.. code-block:: C++
@@ -356,17 +336,7 @@ and then call the :meth:`catlas_saxpby` from accelerate.
// Accelerate library provides catlas_saxpby which does
// Y = (alpha * X) + (beta * Y) in place
// To use it, we first copy the data in y over to the output array
// This specialization requires both x and y be contiguous in the same mode
// i.e: corresponding linear indices in both point to corresponding elements
// The data in the output array is allocated to match the strides in y
// such that x, y, and out are contiguous in the same mode and
// no transposition is needed
out.set_data(
allocator::malloc_or_wait(y.data_size() * out.itemsize()),
y.data_size(),
y.strides(),
y.flags());
out.set_data(allocator::malloc_or_wait(out.nbytes()));
// We then copy over the elements using the contiguous vector specialization
copy_inplace(y, out, CopyType::Vector);
@@ -389,18 +359,20 @@ and then call the :meth:`catlas_saxpby` from accelerate.
/* INCY = */ 1);
}
Great! But what about the inputs that do not fit the criteria for accelerate?
Luckily, we can always just direct back to :meth:`Axpby::eval`.
With this in mind, lets finally implement our :meth:`Axpby::eval_cpu`.
For inputs that do not fit the criteria for accelerate, we fall back to
:meth:`Axpby::eval`. With this in mind, let's finish our
:meth:`Axpby::eval_cpu`.
.. code-block:: C++
/** Evaluate primitive on CPU using accelerate specializations */
void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
void Axpby::eval_cpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 2);
auto& x = inputs[0];
auto& y = inputs[1];
auto& out = outputs[0];
// Accelerate specialization for contiguous single precision float arrays
if (out.dtype() == float32 &&
@@ -410,35 +382,33 @@ With this in mind, lets finally implement our :meth:`Axpby::eval_cpu`.
return;
}
// Fall back to common backend if specializations are not available
eval(inputs, out);
// Fall back to common back-end if specializations are not available
eval(inputs, outputs);
}
We have now hit a milestone! Just this much is enough to run the operation
:meth:`axpby` on a CPU stream!
Just this much is enough to run the operation :meth:`axpby` on a CPU stream! If
you do not plan on running the operation on the GPU or using transforms on
computation graphs that contain :class:`Axpby`, you can stop implementing the
primitive here and enjoy the speed-ups you get from the Accelerate library.
If you do not plan on running the operation on the GPU or using transforms on
computation graphs that contain :class:`Axpby`, you can stop implementing the
primitive here and enjoy the speed-ups you get from the Accelerate library.
Implementing the GPU Backend
Implementing the GPU Back-end
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Apple silicon devices address their GPUs using the Metal_ shading language, and
all GPU kernels in MLX are written using metal.
Apple silicon devices address their GPUs using the Metal_ shading language, and
GPU kernels in MLX are written using Metal.
.. note::
Here are some helpful resources if you are new to metal!
Here are some helpful resources if you are new to Metal:
* A walkthrough of the metal compute pipeline: `Metal Example`_
* Documentation for metal shading language: `Metal Specification`_
* Using metal from C++: `Metal-cpp`_
Let's keep the GPU algorithm simple. We will launch exactly as many threads
as there are elements in the output. Each thread will pick the element it needs
from ``x`` and ``y``, do the pointwise operation, and then update its assigned
element in the output.
Let's keep the GPU kernel simple. We will launch exactly as many threads as
there are elements in the output. Each thread will pick the element it needs
from ``x`` and ``y``, do the point-wise operation, and update its assigned
element in the output.
.. code-block:: C++
@@ -450,67 +420,44 @@ element in the output.
constant const float& alpha [[buffer(3)]],
constant const float& beta [[buffer(4)]],
constant const int* shape [[buffer(5)]],
constant const size_t* x_strides [[buffer(6)]],
constant const size_t* y_strides [[buffer(7)]],
constant const int64_t* x_strides [[buffer(6)]],
constant const int64_t* y_strides [[buffer(7)]],
constant const int& ndim [[buffer(8)]],
uint index [[thread_position_in_grid]]) {
// Convert linear indices to offsets in array
auto x_offset = elem_to_loc(index, shape, x_strides, ndim);
auto y_offset = elem_to_loc(index, shape, y_strides, ndim);
// Do the operation and update the output
out[index] =
out[index] =
static_cast<T>(alpha) * x[x_offset] + static_cast<T>(beta) * y[y_offset];
}
We then need to instantiate this template for all floating point types and give
each instantiation a unique host name so we can identify the right kernel for
each data type.
each instantiation a unique host name so we can identify it.
.. code-block:: C++
instantiate_kernel("axpby_general_float32", axpby_general, float)
instantiate_kernel("axpby_general_float16", axpby_general, float16_t)
instantiate_kernel("axpby_general_bfloat16", axpby_general, bfloat16_t)
instantiate_kernel("axpby_general_complex64", axpby_general, complex64_t)
This kernel will be compiled into a Metal library ``mlx_ext.metallib`` as we
will see later in :ref:`Building with CMake`. In the following example, we
assume that the library ``mlx_ext.metallib`` will always be co-located with
the executable / shared library calling the :meth:`register_library` function.
The :meth:`register_library` function takes the library's name and a potential
path (or in this case, a function that can produce the path of the Metal
library) and tries to load that library if it hasn't already been registered
by the relevant static :class:`mlx::core::metal::Device` object. This is why
it is important to package your C++ library with the Metal library. We will
go over this process in more detail later.
The logic to determine the kernel, set the inputs, resolve the grid dimensions,
and dispatch to the GPU are contained in :meth:`Axpby::eval_gpu` as shown
below.
.. code-block:: C++
/** Evaluate primitive on GPU */
void Axpby::eval_gpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
// Prepare inputs
assert(inputs.size() == 2);
auto& x = inputs[0];
auto& y = inputs[1];
auto& out = outputs[0];
// Each primitive carries the stream it should execute on
// and each stream carries its device identifiers
@@ -518,45 +465,44 @@ below.
// We get the needed metal device using the stream
auto& d = metal::device(s.device);
// Allocate output memory
out.set_data(allocator::malloc_or_wait(out.nbytes()));
// Resolve name of kernel
std::ostringstream kname;
kname << "axpby_" << "general_" << type_to_name(out);
// Make sure the metal library is available
d.register_library("mlx_ext");
// Make a kernel from this metal library
auto kernel = d.get_kernel(kname.str(), "mlx_ext");
// Prepare to encode kernel
auto& compute_encoder = d.get_command_encoder(s.index);
compute_encoder.set_compute_pipeline_state(kernel);
// Kernel parameters are registered with buffer indices corresponding to
// those in the kernel declaration at axpby.metal
int ndim = out.ndim();
size_t nelem = out.size();
// Encode input arrays to kernel
compute_encoder.set_input_array(x, 0);
compute_encoder.set_input_array(y, 1);
// Encode output arrays to kernel
compute_encoder.set_output_array(out, 2);
// Encode alpha and beta
compute_encoder.set_bytes(alpha_, 3);
compute_encoder.set_bytes(beta_, 4);
// Encode shape, strides and ndim
compute_encoder.set_vector_bytes(x.shape(), 5);
compute_encoder.set_vector_bytes(x.strides(), 6);
compute_encoder.set_bytes(y.strides(), 7);
compute_encoder.set_bytes(ndim, 8);
// We launch 1 thread for each input and make sure that the number of
// threads in any given threadgroup is not higher than the max allowed
@@ -568,41 +514,38 @@ below.
// Fix the 3D size of the launch grid (in terms of threads)
MTL::Size grid_dims = MTL::Size(nelem, 1, 1);
// Launch the grid with the given number of threads divided among
// the given threadgroups
compute_encoder->dispatchThreads(grid_dims, group_dims);
compute_encoder.dispatch_threads(grid_dims, group_dims);
}
We can now call the :meth:`axpby` operation on both the CPU and the GPU!
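As a quick sanity check, a minimal sketch (assuming the Python bindings and
packaging described later in this guide have been built) can pick the device
per call with a stream context:

.. code-block:: python

import mlx.core as mx
from mlx_sample_extensions import axpby  # built later in this guide

x = mx.ones((3, 4))
y = mx.ones((3, 4))

# Run on the CPU stream
with mx.stream(mx.cpu):
    z_cpu = axpby(x, y, 4.0, 2.0)

# Run on the GPU stream
with mx.stream(mx.gpu):
    z_gpu = axpby(x, y, 4.0, 2.0)

print(mx.allclose(z_cpu, z_gpu).item())  # True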
A few things to note about MLX and Metal before moving on. MLX keeps track of
the active ``command_buffer`` and the ``MTLCommandBuffer`` to which it is
associated. We rely on :meth:`d.get_command_encoder` to give us the active
Metal compute command encoder instead of building a new one and calling
:meth:`compute_encoder->end_encoding` at the end. MLX adds kernels (compute
pipelines) to the active command buffer until some specified limit is hit or
the command buffer needs to be flushed for synchronization.
Primitive Transforms
^^^^^^^^^^^^^^^^^^^^^
Next, let's add implementations for transformations in a :class:`Primitive`.
These transformations can be built on top of other operations, including the
one we just defined:
.. code-block:: C++
/** The Jacobian-vector product. */
std::vector<array> Axpby::jvp(
const std::vector<array>& primals,
const std::vector<array>& tangents,
const std::vector<int>& argnums) {
// Forward mode diff that pushes along the tangents
// The jvp transform on the primitive can be built with ops
// that are scheduled on the same stream as the primitive
// If argnums = {0}, we only push along x in which case the
// jvp is just the tangent scaled by alpha
@@ -611,12 +554,12 @@ us the following :meth:`Axpby::jvp` and :meth:`Axpby::vjp` implementations.
if (argnums.size() > 1) {
auto scale = argnums[0] == 0 ? alpha_ : beta_;
auto scale_arr = array(scale, tangents[0].dtype());
return {multiply(scale_arr, tangents[0], stream())};
}
// If, argnums = {0, 1}, we take contributions from both
// which gives us jvp = tangent_x * alpha + tangent_y * beta
else {
return {axpby(tangents[0], tangents[1], alpha_, beta_, stream())};
}
}
@@ -625,34 +568,35 @@ us the following :meth:`Axpby::jvp` and :meth:`Axpby::vjp` implementations.
/** The vector-Jacobian product. */
std::vector<array> Axpby::vjp(
const std::vector<array>& primals,
const std::vector<array>& cotangents,
const std::vector<int>& argnums,
const std::vector<int>& /* unused */) {
// Reverse mode diff
std::vector<array> vjps;
for (auto arg : argnums) {
auto scale = arg == 0 ? alpha_ : beta_;
auto scale_arr = array(scale, cotangents[0].dtype());
vjps.push_back(multiply(scale_arr, cotangents[0], stream()));
}
return vjps;
}
Note that a transformation does not need to be fully defined to start using
the :class:`Primitive`.
.. code-block:: C++
/** Vectorize primitive along given axis */
std::pair<std::vector<array>, std::vector<int>> Axpby::vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) {
throw std::runtime_error("Axpby has no vmap implementation.");
throw std::runtime_error("[Axpby] vmap not implemented.");
}
Building and Binding
--------------------
Let's look at the overall directory structure first.
| extensions
| ├── axpby
@@ -666,40 +610,39 @@ Let's look at the overall directory structure first.
| └── setup.py
* ``extensions/axpby/`` defines the C++ extension library
* ``extensions/mlx_sample_extensions`` sets out the structure for the
associated Python package
* ``extensions/bindings.cpp`` provides Python bindings for our operation
* ``extensions/CMakeLists.txt`` holds CMake rules to build the library and
Python bindings
* ``extensions/setup.py`` holds the ``setuptools`` rules to build and install
the Python package
Binding to Python
^^^^^^^^^^^^^^^^^^
We use nanobind_ to build a Python API for the C++ library. Since bindings for
components such as :class:`mlx.core.array`, :class:`mlx.core.stream`, etc. are
already provided, adding our :meth:`axpby` is simple.
.. code-block:: C++
NB_MODULE(_ext, m) {
m.doc() = "Sample extension for MLX";
m.def(
"axpby",
&axpby,
"x"_a,
"y"_a,
"alpha"_a,
"beta"_a,
nb::kw_only(),
"stream"_a = nb::none(),
R"(
Scale and sum two vectors element-wise
``z = alpha * x + beta * y``
Follows numpy style broadcasting between ``x`` and ``y``
Inputs are upcasted to floats if needed
@@ -711,17 +654,17 @@ are already provided, adding our :meth:`axpby` becomes very simple!
Returns:
array: ``alpha * x + beta * y``
)pbdoc");
)");
}
Most of the complexity in the above example comes from additional bells and
whistles such as the literal names and doc-strings.
.. warning::
:mod:`mlx.core` must be imported before importing
:mod:`mlx_sample_extensions` as defined by the nanobind module above to
ensure that the casters for :mod:`mlx.core` components like
:class:`mlx.core.array` are available.
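In practice that just means keeping the imports in this order. A minimal
sketch (the package's ``__init__.py`` described below already performs the
first import for you):

.. code-block:: python

import mlx.core as mx  # registers the mlx.core casters first
from mlx_sample_extensions import axpby

a = mx.array([1.0, 2.0, 3.0])
b = mx.array([4.0, 5.0, 6.0])
print(axpby(a, b, 2.0, 1.0))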
.. _Building with CMake:
@@ -729,8 +672,8 @@ whistles such as the literal names and doc-strings.
Building with CMake
^^^^^^^^^^^^^^^^^^^^
Building the C++ extension library only requires that you ``find_package(MLX
CONFIG)`` and then link it to your library.
.. code-block:: cmake
@@ -752,12 +695,12 @@ Building the C++ extension library itself is simple, it only requires that you
# Link to mlx
target_link_libraries(mlx_ext PUBLIC mlx)
We also need to build the attached Metal library. For convenience, we provide a
:meth:`mlx_build_metallib` function that builds a ``.metallib`` target given
sources, headers, destinations, etc. (defined in ``cmake/extension.cmake`` and
automatically imported with the MLX package).
Here is what that looks like in practice:
.. code-block:: cmake
@@ -779,27 +722,29 @@ Here is what that looks like in practice!
endif()
Finally, we build the nanobind_ bindings
.. code-block:: cmake
nanobind_add_module(
_ext
NB_STATIC STABLE_ABI LTO NOMINSIZE
NB_DOMAIN mlx
${CMAKE_CURRENT_LIST_DIR}/bindings.cpp
)
target_link_libraries(_ext PRIVATE mlx_ext)
if(BUILD_SHARED_LIBS)
target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path)
endif()
Building with ``setuptools``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Once we have set out the CMake build rules as described above, we can use the
build utilities defined in :mod:`mlx.extension`:
.. code-block:: python
from mlx import extension
from setuptools import setup
@@ -809,48 +754,50 @@ build utilities defined in :mod:`mlx.extension` for a simple build process.
name="mlx_sample_extensions",
version="0.0.0",
description="Sample C++ and Metal extensions for MLX primitives.",
ext_modules=[extension.CMakeExtension("mlx_sample_extensions")],
ext_modules=[extension.CMakeExtension("mlx_sample_extensions._ext")],
cmdclass={"build_ext": extension.CMakeBuild},
packages = ["mlx_sample_extensions"],
package_dir = {"": "mlx_sample_extensions"},
package_data = {"mlx_sample_extensions" : ["*.so", "*.dylib", "*.metallib"]},
packages=["mlx_sample_extensions"],
package_data={"mlx_sample_extensions": ["*.so", "*.dylib", "*.metallib"]},
extras_require={"dev":[]},
zip_safe=False,
python_requires=">=3.7",
python_requires=">=3.8",
)
.. note::
We treat ``extensions/mlx_sample_extensions`` as the package directory
even though it only contains a ``__init__.py`` to ensure the following:
* :mod:`mlx.core` must be imported before importing :mod:`_ext`
* The C++ extension library and the Metal library are co-located with the Python
bindings and copied together if the package is installed
To build the package, first install the build dependencies with ``pip install
-r requirements.txt``. You can then build inplace for development using
``python setup.py build_ext -j8 --inplace`` (in ``extensions/``)
This results in the directory structure:
| extensions
| ├── mlx_sample_extensions
| │ ├── __init__.py
| │ ├── libmlx_ext.dylib # C++ extension library
| │ ├── mlx_ext.metallib # Metal library
| │ └── _ext.cpython-3x-darwin.so # Python Binding
| ...
When you try to install using the command ``python -m pip install .`` (in
``extensions/``), the package will be installed with the same structure as
``extensions/mlx_sample_extensions`` and the C++ and Metal library will be
copied along with the Python binding since they are specified as
``package_data``.
Usage
-----
After installing the extension as described above, you should be able to simply
import the Python package and play with it as you would any other MLX operation.
Let's look at a simple script and its results:
.. code-block:: python
@@ -863,7 +810,7 @@ Let's looks at a simple script and it's results!
print(f"c shape: {c.shape}")
print(f"c dtype: {c.dtype}")
print(f"c correctness: {mx.all(c == 6.0).item()}")
print(f"c correct: {mx.all(c == 6.0).item()}")
Output:
@@ -874,12 +821,12 @@ Output:
c correct: True
Results
^^^^^^^
Let's run a quick benchmark and see how our new ``axpby`` operation compares
with the naive :meth:`simple_axpby` we first defined on the CPU.
.. code-block:: python
import mlx.core as mx
from mlx_sample_extensions import axpby
@@ -898,7 +845,7 @@ with the naive :meth:`simple_axpby` we defined at first on the CPU.
alpha = 4.0
beta = 2.0
mx.eval(x, y)
def bench(f):
# Warm up
@@ -919,30 +866,23 @@ with the naive :meth:`simple_axpby` we defined at first on the CPU.
print(f"Simple axpby: {simple_time:.3f} s | Custom axpby: {custom_time:.3f} s")
The results are ``Simple axpby: 0.114 s | Custom axpby: 0.109 s``. We see
modest improvements right away!
This operation is now good to be used to build other operations, in
:class:`mlx.nn.Module` calls, and also as a part of graph transformations like
:meth:`grad`.
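For example, a small illustrative sketch of differentiating through ``axpby``
with :meth:`grad` (this exercises the :meth:`Axpby::vjp` we defined above):

.. code-block:: python

import mlx.core as mx
from mlx_sample_extensions import axpby

def loss(x, y):
    return axpby(x, y, 4.0, 2.0).sum()

x = mx.ones((3,))
y = mx.ones((3,))

# d(loss)/dx is alpha for every element
print(mx.grad(loss)(x, y))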
Scripts
-------
.. admonition:: Download the code
The full example code is available in `mlx <https://github.com/ml-explore/mlx/tree/main/examples/extensions/>`_.
.. _Accelerate: https://developer.apple.com/documentation/accelerate/blas?language=objc
.. _Metal: https://developer.apple.com/documentation/metal?language=objc
.. _Metal-cpp: https://developer.apple.com/metal/cpp/
.. _`Metal Specification`: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
.. _`Metal Example`: https://developer.apple.com/documentation/metal/performing_calculations_on_a_gpu?language=objc
.. _PyBind11: https://pybind11.readthedocs.io/en/stable/
.. _nanobind: https://nanobind.readthedocs.io/en/latest/

View File

@@ -0,0 +1,68 @@
Metal Debugger
==============
.. currentmodule:: mlx.core
Profiling is a key step for performance optimization. You can build MLX with
the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and
optimization workflow. The ``MLX_METAL_DEBUG`` debug option:
* Records source during Metal compilation, for later inspection while
debugging.
* Labels Metal objects such as command queues, improving capture readability.
To build with debugging enabled in Python prepend
``CMAKE_ARGS="-DMLX_METAL_DEBUG=ON"`` to the build call.
The :func:`metal.start_capture` function initiates a capture of all MLX GPU
work.
.. note::
To capture a GPU trace you must run the application with
``MTL_CAPTURE_ENABLED=1``.
.. code-block:: python
import mlx.core as mx
a = mx.random.uniform(shape=(512, 512))
b = mx.random.uniform(shape=(512, 512))
mx.eval(a, b)
trace_file = "mlx_trace.gputrace"
# Make sure to run with MTL_CAPTURE_ENABLED=1 and
# that the path trace_file does not already exist.
mx.metal.start_capture(trace_file)
for _ in range(10):
mx.eval(mx.add(a, b))
mx.metal.stop_capture()
You can open and replay the GPU trace in Xcode. The ``Dependencies`` view
has a great overview of all operations. Check out the `Metal debugger
documentation`_ for more information.
.. image:: ../_static/metal_debugger/capture.png
:class: dark-light
Xcode Workflow
--------------
You can skip saving to a path by running within Xcode. First, generate an
Xcode project using CMake.
.. code-block::
mkdir build && cd build
cmake .. -DMLX_METAL_DEBUG=ON -G Xcode
open mlx.xcodeproj
Select the ``metal_capture`` example scheme and run.
.. image:: ../_static/metal_debugger/schema.png
:class: dark-light
.. _`Metal debugger documentation`: https://developer.apple.com/documentation/xcode/metal-debugger

121
docs/src/dev/mlx_in_cpp.rst Normal file
View File

@@ -0,0 +1,121 @@
.. _mlx_in_cpp:
Using MLX in C++
================
You can use MLX in a C++ project with CMake.
.. note::
This guide is based on the following `example using MLX in C++
<https://github.com/ml-explore/mlx/tree/main/examples/cmake_project>`_
First install MLX:
.. code-block:: bash
pip install -U mlx
You can also install the MLX Python package from source or just the C++
library. For more information see the :ref:`documentation on installing MLX
<build_and_install>`.
Next make an example program in ``example.cpp``:
.. code-block:: C++
#include <iostream>
#include "mlx/mlx.h"
namespace mx = mlx::core;
int main() {
auto x = mx::array({1, 2, 3});
auto y = mx::array({1, 2, 3});
std::cout << x + y << std::endl;
return 0;
}
The next step is to set up a CMake file in ``CMakeLists.txt``:
.. code-block:: cmake
cmake_minimum_required(VERSION 3.27)
project(example LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
Depending on how you installed MLX, you may need to tell CMake where to
find it.
If you installed MLX with Python, then add the following to the CMake file:
.. code-block:: cmake
find_package(
Python 3.9
COMPONENTS Interpreter Development.Module
REQUIRED)
execute_process(
COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE MLX_ROOT)
If you installed the MLX C++ package to a system path, then CMake should be
able to find it. If you installed it to a non-standard location or CMake can't
find MLX then set ``MLX_ROOT`` to the location where MLX is installed:
.. code-block:: cmake
set(MLX_ROOT "/path/to/mlx/")
Next, instruct CMake to find MLX:
.. code-block:: cmake
find_package(MLX CONFIG REQUIRED)
Finally, add the ``example.cpp`` program as an executable and link MLX.
.. code-block:: cmake
add_executable(example example.cpp)
target_link_libraries(example PRIVATE mlx)
You can build the example with:
.. code-block:: bash
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build
And run it with:
.. code-block:: bash
./build/example
Note ``find_package(MLX CONFIG REQUIRED)`` sets the following variables:
.. list-table:: Package Variables
:widths: 20 20
:header-rows: 1
* - Variable
- Description
* - MLX_FOUND
- ``True`` if MLX is found
* - MLX_INCLUDE_DIRS
- Include directory
* - MLX_LIBRARIES
- Libraries to link against
* - MLX_CXX_FLAGS
- Additional compiler flags
* - MLX_BUILD_ACCELERATE
- ``True`` if MLX was built with Accelerate
* - MLX_BUILD_METAL
- ``True`` if MLX was built with Metal

View File

@@ -15,7 +15,7 @@ module to concisely define the model architecture.
Attention layer
^^^^^^^^^^^^^^^^
We will start with the llama attention layer which notably uses the RoPE
We will start with the Llama attention layer which notably uses the RoPE
positional encoding. [1]_ In addition, our attention layer will optionally use a
key/value cache that will be concatenated with the provided keys and values to
support efficient inference.
@@ -371,7 +371,7 @@ Scripts
The full example code is available in `mlx-examples`_.
.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llama
.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llms/llama
.. [1] Su, J., Lu, Y., Pan, S., Murtadha, A., Wen, B. and Liu, Y., 2021.
Roformer: Enhanced transformer with rotary position embedding. arXiv

View File

@@ -61,7 +61,10 @@ set:
def eval_fn(model, X, y):
return mx.mean(mx.argmax(model(X), axis=1) == y)
Next, setup the problem parameters and load the data:
Next, setup the problem parameters and load the data. To load the data, you need our
`mnist data loader
<https://github.com/ml-explore/mlx-examples/blob/main/mnist/mnist.py>`_, which
we will import as ``mnist``.
.. code-block:: python

View File

@@ -19,7 +19,7 @@ The main differences between MLX and NumPy are:
The design of MLX is inspired by frameworks like `PyTorch
<https://pytorch.org/>`_, `Jax <https://github.com/google/jax>`_, and
`ArrayFire <https://arrayfire.org/>`_. A notable difference between these
frameworks and MLX is the *unified memory model*. Arrays in MLX live in shared
memory. Operations on MLX arrays can be performed on any of the supported
device types without performing data copies. Currently supported device types
@@ -35,8 +35,17 @@ are the CPU and GPU.
:caption: Usage
:maxdepth: 1
quick_start
using_streams
usage/quick_start
usage/lazy_evaluation
usage/unified_memory
usage/indexing
usage/saving_and_loading
usage/function_transforms
usage/compile
usage/numpy
usage/distributed
usage/using_streams
usage/export
.. toctree::
:caption: Examples
@@ -51,13 +60,19 @@ are the CPU and GPU.
:maxdepth: 1
python/array
python/data_types
python/devices_and_streams
python/export
python/ops
python/random
python/transforms
python/fast
python/fft
python/linalg
python/metal
python/nn
python/optimizers
python/distributed
python/tree_utils
.. toctree::
@@ -71,3 +86,6 @@ are the CPU and GPU.
:maxdepth: 1
dev/extensions
dev/metal_debugger
dev/custom_metal_kernels
dev/mlx_in_cpp

View File

@@ -1,8 +1,10 @@
.. _build_and_install:
Build and Install
=================
Python Installation
-------------------
MLX is available on PyPI. All you have to do to use MLX with your own Apple
silicon computer is
@@ -11,9 +13,40 @@ silicon computer is
pip install mlx
To install from PyPI you must meet the following requirements:
- Using an M series chip (Apple silicon)
- Using a native Python >= 3.9
- macOS >= 13.5
.. note::
MLX is only available on devices running macOS >= 13.5
It is highly recommended to use macOS 14 (Sonoma)
MLX is also available on conda-forge. To install MLX with conda do:
.. code-block:: shell
conda install conda-forge::mlx
Troubleshooting
^^^^^^^^^^^^^^^
*My OS and Python versions are in the required range but pip still does not find
a matching distribution.*
Probably you are using a non-native Python. The output of
.. code-block:: shell
python -c "import platform; print(platform.processor())"
should be ``arm``. If it is ``i386`` (and you have an M series machine) then you
are using a non-native Python. Switch your Python to a native Python. A good
way to do this is with `Conda <https://stackoverflow.com/q/65415996>`_.
Build from source
-----------------
@@ -22,9 +55,12 @@ Build Requirements
^^^^^^^^^^^^^^^^^^
- A C++ compiler with C++17 support (e.g. Clang >= 5.0)
- `cmake <https://cmake.org/>`_ -- version 3.25 or later, and ``make``
- Xcode >= 15.0 and macOS SDK >= 14.0
.. note::
Ensure your shell environment is native ``arm``, not ``x86`` via Rosetta. If
the output of ``uname -p`` is ``x86``, see the :ref:`troubleshooting section <build shell>` below.
Python API
^^^^^^^^^^
@@ -36,33 +72,38 @@ To build and install the MLX python library from source, first, clone MLX from
git clone git@github.com:ml-explore/mlx.git mlx && cd mlx
Then simply build and install MLX using pip:
.. code-block:: shell
CMAKE_BUILD_PARALLEL_LEVEL=8 pip install .
For developing, install the package with development dependencies, and use an
editable install:
.. code-block:: shell
CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -e ".[dev]"
Once the development dependencies are installed, you can build faster with:
.. code-block:: shell
CMAKE_BUILD_PARALLEL_LEVEL=8 python setup.py build_ext --inplace
Run the tests with:
.. code-block:: shell
python -m unittest discover python/tests
Optional: Install stubs to enable auto completions and type checking from your
IDE:
.. code-block:: shell
python setup.py generate_stubs
C++ API
^^^^^^^
@@ -81,7 +122,7 @@ Create a build directory and run CMake and make:
.. code-block:: shell
mkdir -p build && cd build
cmake .. && make -j
Run tests with:
@@ -100,7 +141,7 @@ directory as the executable statically linked to ``libmlx.a`` or the
preprocessor constant ``METAL_PATH`` should be defined at build time and it
should point to the path to the built metal library.
.. list-table:: Build Options
:widths: 25 8
:header-rows: 1
@@ -114,23 +155,115 @@ should point to the path to the built metal library.
- OFF
* - MLX_BUILD_METAL
- ON
* - MLX_BUILD_CPU
- ON
* - MLX_BUILD_PYTHON_BINDINGS
- OFF
* - MLX_METAL_DEBUG
- OFF
* - MLX_BUILD_SAFETENSORS
- ON
* - MLX_BUILD_GGUF
- ON
* - MLX_METAL_JIT
- OFF
.. note::
If you have multiple Xcode installations and wish to use
a specific one while building, you can do so by adding the
following environment variable before building
.. code-block:: shell
export DEVELOPER_DIR="/path/to/Xcode.app/Contents/Developer/"
Further, you can use the following command to find out which
macOS SDK will be used
.. code-block:: shell
xcrun -sdk macosx --show-sdk-version
Binary Size Minimization
~~~~~~~~~~~~~~~~~~~~~~~~
To produce a smaller binary use the CMake flags ``CMAKE_BUILD_TYPE=MinSizeRel``
and ``BUILD_SHARED_LIBS=ON``.
The MLX CMake build has several additional options to make smaller binaries.
For example, if you don't need the CPU backend or support for safetensors and
GGUF, you can do:
.. code-block:: shell
cmake .. \
-DCMAKE_BUILD_TYPE=MinSizeRel \
-DBUILD_SHARED_LIBS=ON \
-DMLX_BUILD_CPU=OFF \
-DMLX_BUILD_SAFETENSORS=OFF \
-DMLX_BUILD_GGUF=OFF \
-DMLX_METAL_JIT=ON
The ``MLX_METAL_JIT`` flag minimizes the size of the MLX Metal library which
contains pre-built GPU kernels. This substantially reduces the size of the
Metal library by run-time compiling kernels the first time they are used in MLX
on a given machine. Note that run-time compilation incurs a cold-start cost which
can be anywhere from a few hundred milliseconds to a few seconds depending on the
application. Once a kernel is compiled, it will be cached by the system. The
Metal kernel cache persists across reboots.
Troubleshooting
^^^^^^^^^^^^^^^
Metal not found
~~~~~~~~~~~~~~~
You see the following error when you try to build:
.. code-block:: shell
error: unable to find utility "metal", not a developer tool or in PATH
To fix this, first make sure you have Xcode installed:
.. code-block:: shell
xcode-select --install
Then set the active developer directory:
.. code-block:: shell
sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
x86 Shell
~~~~~~~~~
.. _build shell:
If the output of ``uname -p`` is ``x86`` then your shell is running as x86 via
Rosetta instead of natively.
To fix this, find the application in Finder (``/Applications`` for iTerm,
``/Applications/Utilities`` for Terminal), right-click, and click “Get Info”.
Uncheck “Open using Rosetta”, close the “Get Info” window, and restart your
terminal.
Verify the terminal is now running natively with the following command:
.. code-block:: shell
$ uname -p
arm
Also check that cmake is using the correct architecture:
.. code-block:: shell
$ cmake --system-information | grep CMAKE_HOST_SYSTEM_PROCESSOR
CMAKE_HOST_SYSTEM_PROCESSOR "arm64"
If you see ``"x86_64"``, try re-installing ``cmake``. If you see ``"arm64"``
but the build errors out with "Building for x86_64 on macOS is not supported."
wipe your build cache with ``rm -rf build/`` and try again.

View File

@@ -10,36 +10,53 @@ Array
array
array.astype
array.at
array.item
array.tolist
array.dtype
array.itemsize
array.nbytes
array.ndim
array.shape
array.size
Dtype
array.abs
array.all
array.any
array.argmax
array.argmin
array.conj
array.cos
array.dtype
array.cummax
array.cummin
array.cumprod
array.cumsum
array.diag
array.diagonal
array.exp
array.flatten
array.log
array.log10
array.log1p
array.log2
array.logsumexp
array.max
array.mean
array.min
array.moveaxis
array.prod
array.reciprocal
array.reshape
array.round
array.rsqrt
array.sin
array.split
array.sqrt
array.square
array.squeeze
array.std
array.sum
array.swapaxes
array.transpose
array.T
array.var
array.view

View File

@@ -1,7 +1,5 @@
.. _data_types:
:orphan:
Data Types
==========
@@ -29,9 +27,9 @@ The default floating point type is ``float32`` and the default integer type is
* - ``uint32``
- 4
- 32-bit unsigned integer
* - ``uint64``
- 8
- 64-bit unsigned integer
* - ``int8``
- 1
- 8-bit signed integer
@@ -44,9 +42,28 @@ The default floating point type is ``float32`` and the default integer type is
* - ``int64``
- 8
- 64-bit signed integer
* - ``bfloat16``
- 2
- 16-bit brain float (e8, m7)
* - ``float16``
- 2
- 16-bit IEEE float (e5, m10)
* - ``float32``
- 4
- 32-bit float
* - ``complex64``
- 8
- 64-bit complex float
Data types are arranged in a hierarchy. See the :obj:`DtypeCategory` object
documentation for more information. Use :func:`issubdtype` to determine if one
``dtype`` (or category) is a subtype of another category.
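For example, a short illustration using the ``floating``, ``integer``, and
``signedinteger`` categories:

.. code-block:: python

import mlx.core as mx

print(mx.issubdtype(mx.float32, mx.floating))      # True
print(mx.issubdtype(mx.int8, mx.integer))          # True
print(mx.issubdtype(mx.uint64, mx.signedinteger))  # False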
.. autosummary::
:toctree: _autosummary
Dtype
DtypeCategory
issubdtype
finfo

View File

@@ -9,9 +9,11 @@ Devices and Streams
:toctree: _autosummary
Device
Stream
default_device
set_default_device
default_stream
new_stream
set_default_stream
stream
synchronize

View File

@@ -0,0 +1,22 @@
.. _distributed:
.. currentmodule:: mlx.core.distributed
Distributed Communication
==========================
MLX provides a distributed communication package using MPI. The MPI library is
loaded at runtime; if MPI is available then distributed communication is also
made available.
.. autosummary::
:toctree: _autosummary
Group
is_available
init
all_sum
all_gather
send
recv
recv_like

View File

@@ -0,0 +1,14 @@
.. _export:
Export Functions
================
.. currentmodule:: mlx.core
.. autosummary::
:toctree: _autosummary
export_function
import_function
exporter
export_to_dot

15
docs/src/python/fast.rst Normal file
View File

@@ -0,0 +1,15 @@
.. _fast:
Fast
====
.. currentmodule:: mlx.core.fast
.. autosummary::
:toctree: _autosummary
rms_norm
layer_norm
rope
scaled_dot_product_attention
metal_kernel

View File

@@ -0,0 +1,20 @@
.. _linalg:
Linear Algebra
==============
.. currentmodule:: mlx.core.linalg
.. autosummary::
:toctree: _autosummary
inv
tri_inv
norm
cholesky
cholesky_inv
cross
qr
svd
eigvalsh
eigh

20
docs/src/python/metal.rst Normal file
View File

@@ -0,0 +1,20 @@
Metal
=====
.. currentmodule:: mlx.core.metal
.. autosummary::
:toctree: _autosummary
is_available
device_info
get_active_memory
get_peak_memory
reset_peak_memory
get_cache_memory
set_memory_limit
set_cache_limit
set_wired_limit
clear_cache
start_capture
stop_capture

View File

@@ -64,7 +64,6 @@ Quick Start with Neural Networks
# gradient with respect to `mlp.trainable_parameters()`
loss_and_grad = nn.value_and_grad(mlp, l2_loss)
.. _module_class:
The Module Class
@@ -86,20 +85,58 @@ name should not start with ``_``). It can be arbitrarily nested in other
:meth:`Module.parameters` can be used to extract a nested dictionary with all
the parameters of a module and its submodules.
A :class:`Module` can also keep track of "frozen" parameters. See the
:meth:`Module.freeze` method for more details. :meth:`Module.trainable_parameters`
returns only the subset of :meth:`Module.parameters` that is not frozen. When
using :meth:`mlx.nn.value_and_grad` the gradients returned will be with respect
to these trainable parameters.
Updating the Parameters
^^^^^^^^^^^^^^^^^^^^^^^
MLX modules allow accessing and updating individual parameters. However, most
times we need to update large subsets of a module's parameters. This action is
performed by :meth:`Module.update`.
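For example, a minimal sketch of a hand-rolled SGD step that rescales every
gradient with :func:`mlx.utils.tree_map` and applies it with
:meth:`Module.update`:

.. code-block:: python

import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_map

model = nn.Linear(4, 2)

def loss_fn(model, x):
    return model(x).sum()

_, grads = nn.value_and_grad(model, loss_fn)(model, mx.ones((1, 4)))

# p <- p - lr * g for every parameter in the module
lr = 0.1
model.update(tree_map(lambda p, g: p - lr * g, model.parameters(), grads))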
Inspecting Modules
^^^^^^^^^^^^^^^^^^
The simplest way to see the model architecture is to print it. Following along with
the above example, you can print the ``MLP`` with:
.. code-block:: python
print(mlp)
This will display:
.. code-block:: shell
MLP(
(layers.0): Linear(input_dims=2, output_dims=128, bias=True)
(layers.1): Linear(input_dims=128, output_dims=128, bias=True)
(layers.2): Linear(input_dims=128, output_dims=10, bias=True)
)
To get more detailed information on the arrays in a :class:`Module` you can use
:func:`mlx.utils.tree_map` on the parameters. For example, to see the shapes of
all the parameters in a :class:`Module` do:
.. code-block:: python
from mlx.utils import tree_map
shapes = tree_map(lambda p: p.shape, mlp.parameters())
As another example, you can count the number of parameters in a :class:`Module`
with:
.. code-block:: python
from mlx.utils import tree_flatten
num_params = sum(v.size for _, v in tree_flatten(mlp.parameters()))
Value and Grad
--------------
Using a :class:`Module` does not preclude using MLX's high order function
@@ -136,37 +173,12 @@ In detail:
:toctree: _autosummary
value_and_grad
quantize
Neural Network Layers
---------------------
.. toctree::
.. autosummary::
:toctree: _autosummary
:template: nn-module-template.rst
Embedding
ReLU
GELU
SiLU
Linear
Conv1d
Conv2d
LayerNorm
RMSNorm
GroupNorm
RoPE
MultiHeadAttention
Sequential
Layers without parameters (e.g. activation functions) are also provided as
simple functions.
.. autosummary::
:toctree: _autosummary_functions
:template: nn-module-template.rst
gelu
gelu_approx
gelu_fast_approx
relu
silu
nn/module
nn/layers
nn/functions
nn/losses
nn/init

View File

@@ -0,0 +1,39 @@
.. _nn_functions:
.. currentmodule:: mlx.nn
Functions
---------
Layers without parameters (e.g. activation functions) are also provided as
simple functions (see the short example after the list below).
.. autosummary::
:toctree: _autosummary_functions
:template: nn-module-template.rst
elu
celu
gelu
gelu_approx
gelu_fast_approx
glu
hard_shrink
hard_tanh
hardswish
leaky_relu
log_sigmoid
log_softmax
mish
prelu
relu
relu6
selu
sigmoid
silu
softmax
softmin
softplus
softshrink
step
tanh
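For instance, the functional and layer forms of an activation are
interchangeable on plain arrays (a short illustrative sketch):

.. code-block:: python

import mlx.core as mx
import mlx.nn as nn

x = mx.array([-1.0, 0.0, 2.0])

y1 = nn.relu(x)       # functional form
y2 = nn.ReLU()(x)     # equivalent layer form

print(mx.array_equal(y1, y2).item())  # True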

View File

@@ -0,0 +1,45 @@
.. _init:
.. currentmodule:: mlx.nn.init
Initializers
------------
The ``mlx.nn.init`` package contains commonly used initializers for neural
network parameters. Initializers return a function which can be applied to any
input :obj:`mlx.core.array` to produce an initialized output.
For example:
.. code:: python
import mlx.core as mx
import mlx.nn as nn
init_fn = nn.init.uniform()
# Produces a [2, 2] uniform matrix
param = init_fn(mx.zeros((2, 2)))
To re-initialize all the parameters in an :obj:`mlx.nn.Module` from, say, a
uniform distribution, you can do:
.. code:: python
import mlx.nn as nn
model = nn.Sequential(nn.Linear(5, 10), nn.ReLU(), nn.Linear(10, 5))
init_fn = nn.init.uniform(low=-0.1, high=0.1)
model.apply(init_fn)
.. autosummary::
:toctree: _autosummary
constant
normal
uniform
identity
glorot_normal
glorot_uniform
he_normal
he_uniform

View File

@@ -0,0 +1,69 @@
.. _layers:
.. currentmodule:: mlx.nn
Layers
------
.. autosummary::
:toctree: _autosummary
:template: nn-module-template.rst
ALiBi
AvgPool1d
AvgPool2d
AvgPool3d
BatchNorm
CELU
Conv1d
Conv2d
Conv3d
ConvTranspose1d
ConvTranspose2d
ConvTranspose3d
Dropout
Dropout2d
Dropout3d
Embedding
ELU
GELU
GLU
GroupNorm
GRU
HardShrink
HardTanh
Hardswish
InstanceNorm
LayerNorm
LeakyReLU
Linear
LogSigmoid
LogSoftmax
LSTM
MaxPool1d
MaxPool2d
MaxPool3d
Mish
MultiHeadAttention
PReLU
QuantizedEmbedding
QuantizedLinear
RMSNorm
ReLU
ReLU6
RNN
RoPE
SELU
Sequential
Sigmoid
SiLU
SinusoidalPositionalEncoding
Softmin
Softshrink
Softsign
Softmax
Softplus
Step
Tanh
Transformer
Upsample

View File

@@ -0,0 +1,25 @@
.. _losses:
.. currentmodule:: mlx.nn.losses
Loss Functions
--------------
.. autosummary::
:toctree: _autosummary_functions
:template: nn-module-template.rst
binary_cross_entropy
cosine_similarity_loss
cross_entropy
gaussian_nll_loss
hinge_loss
huber_loss
kl_div_loss
l1_loss
log_cosh_loss
margin_ranking_loss
mse_loss
nll_loss
smooth_l1_loss
triplet_loss

View File

@@ -1,7 +1,38 @@
Module
======
.. currentmodule:: mlx.nn
.. autoclass:: Module
:members:
.. rubric:: Attributes
.. autosummary::
:toctree: _autosummary
Module.training
Module.state
.. rubric:: Methods
.. autosummary::
:toctree: _autosummary
Module.apply
Module.apply_to_modules
Module.children
Module.eval
Module.filter_and_map
Module.freeze
Module.leaf_modules
Module.load_weights
Module.modules
Module.named_modules
Module.parameters
Module.save_weights
Module.set_dtype
Module.train
Module.trainable_parameters
Module.unfreeze
Module.update
Module.update_modules

View File

@@ -5,13 +5,14 @@ Operations
.. currentmodule:: mlx.core
.. autosummary::
:toctree: _autosummary
abs
add
addmm
all
allclose
any
arange
arccos
@@ -19,30 +20,80 @@ Operations
arcsin
arcsinh
arctan
arctan2
arctanh
argmax
argmin
argpartition
argsort
array_equal
as_strided
atleast_1d
atleast_2d
atleast_3d
bitwise_and
bitwise_or
bitwise_xor
block_masked_mm
broadcast_to
ceil
clip
concatenate
conj
conjugate
convolve
conv1d
conv2d
conv3d
conv_transpose1d
conv_transpose2d
conv_transpose3d
conv_general
cos
cosh
cummax
cummin
cumprod
cumsum
degrees
dequantize
diag
diagonal
divide
divmod
einsum
einsum_path
equal
erf
erfinv
exp
expm1
expand_dims
eye
flatten
floor
floor_divide
full
gather_mm
gather_qmm
greater
greater_equal
hadamard_transform
identity
imag
inner
isfinite
isclose
isinf
isnan
isneginf
isposinf
issubdtype
kron
left_shift
less
less_equal
linspace
load
log
log2
@@ -50,45 +101,79 @@ Operations
log1p
logaddexp
logical_not
logical_and
logical_or
logsumexp
matmul
max
maximum
mean
meshgrid
min
minimum
moveaxis
multiply
nan_to_num
negative
not_equal
ones
ones_like
outer
partition
pad
power
prod
put_along_axis
quantize
quantized_matmul
radians
real
reciprocal
remainder
repeat
reshape
right_shift
roll
round
rsqrt
save
savez
savez_compressed
save_gguf
save_safetensors
sigmoid
sign
sin
sinh
slice
slice_update
softmax
sort
split
sqrt
square
squeeze
stack
std
stop_gradient
subtract
sum
swapaxes
take
take_along_axis
tan
tanh
tensordot
tile
topk
trace
transpose
tri
tril
triu
unflatten
var
view
where
zeros
zeros_like

View File

@@ -1,5 +1,7 @@
.. _optimizers:
.. currentmodule:: mlx.optimizers
Optimizers
==========
@@ -29,13 +31,48 @@ model's parameters and the **optimizer state**.
# Compute the new parameters but also the optimizer state.
mx.eval(model.parameters(), optimizer.state)
.. currentmodule:: mlx.optimizers
Saving and Loading
------------------
To serialize an optimizer, save its state. To load an optimizer, load and set
the saved state. Here's a simple example:
.. code-block:: python
import mlx.core as mx
from mlx.utils import tree_flatten, tree_unflatten
import mlx.optimizers as optim
optimizer = optim.Adam(learning_rate=1e-2)
# Perform some updates with the optimizer
model = {"w" : mx.zeros((5, 5))}
grads = {"w" : mx.ones((5, 5))}
optimizer.update(model, grads)
# Save the state
state = tree_flatten(optimizer.state)
mx.save_safetensors("optimizer.safetensors", dict(state))
# Later on, for example when loading from a checkpoint,
# recreate the optimizer and load the state
optimizer = optim.Adam(learning_rate=1e-2)
state = tree_unflatten(list(mx.load("optimizer.safetensors").items()))
optimizer.state = state
Note, not every optimizer configuration parameter is saved in the state. For
example, for Adam the learning rate is saved but the ``betas`` and ``eps``
parameters are not. A good rule of thumb is if the parameter can be scheduled
then it will be included in the optimizer state.
.. toctree::
optimizers/optimizer
optimizers/common_optimizers
optimizers/schedulers
.. autosummary::
:toctree: _autosummary
:template: optimizers-template.rst
OptimizerState
Optimizer
SGD
Adam
clip_grad_norm

View File

@@ -0,0 +1,20 @@
.. _common_optimizers:
Common Optimizers
=================
.. currentmodule:: mlx.optimizers
.. autosummary::
:toctree: _autosummary
:template: optimizers-template.rst
SGD
RMSprop
Adagrad
Adafactor
AdaDelta
Adam
AdamW
Adamax
Lion

View File

@@ -0,0 +1,23 @@
Optimizer
=========
.. currentmodule:: mlx.optimizers
.. autoclass:: Optimizer
.. rubric:: Attributes
.. autosummary::
:toctree: _autosummary
Optimizer.state
.. rubric:: Methods
.. autosummary::
:toctree: _autosummary
Optimizer.apply_gradients
Optimizer.init
Optimizer.update

View File

@@ -0,0 +1,15 @@
.. _schedulers:
Schedulers
==========
.. currentmodule:: mlx.optimizers
.. autosummary::
:toctree: _autosummary
cosine_decay
exponential_decay
join_schedules
linear_schedule
step_decay

View File

@@ -33,13 +33,16 @@ we use a splittable version of Threefry, which is a counter-based PRNG.
.. autosummary::
:toctree: _autosummary
bernoulli
categorical
gumbel
key
normal
multivariate_normal
randint
seed
split
truncated_normal
uniform
laplace
permutation

View File

@@ -9,6 +9,10 @@ Transforms
:toctree: _autosummary
eval
compile
custom_function
disable_compile
enable_compile
grad
value_and_grad
jvp

View File

@@ -19,3 +19,5 @@ return python trees will be using the default python ``dict``, ``list`` and
tree_flatten
tree_unflatten
tree_map
tree_map_with_path
tree_reduce

497
docs/src/usage/compile.rst Normal file
View File

@@ -0,0 +1,497 @@
.. _compile:
Compilation
===========
.. currentmodule:: mlx.core
MLX has a :func:`compile` function transformation which compiles computation
graphs. Function compilation results in smaller graphs by merging common work
and fusing certain operations. In many cases this can lead to big improvements
in run-time and memory use.
Getting started with :func:`compile` is simple, but there are some edge cases
that are good to be aware of for more complex graphs and advanced usage.
Basics of Compile
-----------------
Let's start with a simple example:
.. code-block:: python
def fun(x, y):
return mx.exp(-x) + y
x = mx.array(1.0)
y = mx.array(2.0)
# Regular call, no compilation
# Prints: array(2.36788, dtype=float32)
print(fun(x, y))
# Compile the function
compiled_fun = mx.compile(fun)
# Prints: array(2.36788, dtype=float32)
print(compiled_fun(x, y))
The output of both the regular function and the compiled function is the same
up to numerical precision.
The first time you call a compiled function, MLX will build the compute
graph, optimize it, and generate and compile code. This can be relatively
slow. However, MLX will cache compiled functions, so calling a compiled
function multiple times will not initiate a new compilation. This means you
should typically compile functions that you plan to use more than once.
.. code-block:: python
def fun(x, y):
return mx.exp(-x) + y
x = mx.array(1.0)
y = mx.array(2.0)
compiled_fun = mx.compile(fun)
# Compiled here
compiled_fun(x, y)
# Not compiled again
compiled_fun(x, y)
# Not compiled again
mx.compile(fun)(x, y)
There are some important cases to be aware of that can cause a function to
be recompiled:
* Changing the shape or number of dimensions
* Changing the type of any of the inputs
* Changing the number of inputs to the function
In certain cases only some of the compilation stack will be rerun (for
example when changing the shapes) and in other cases the full compilation
stack will be rerun (for example when changing the types). In general you
should avoid compiling functions too frequently.
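For example, each call below that changes the shape or the dtype of the inputs
triggers a fresh trace and compilation (an illustrative sketch):

.. code-block:: python

import mlx.core as mx

def fun(x, y):
    return mx.exp(-x) + y

compiled_fun = mx.compile(fun)

compiled_fun(mx.zeros((2, 2)), mx.zeros((2, 2)))  # compiles
compiled_fun(mx.zeros((2, 2)), mx.zeros((2, 2)))  # cached, no recompilation
compiled_fun(mx.zeros((4, 4)), mx.zeros((4, 4)))  # new shape -> recompiles
compiled_fun(
    mx.zeros((4, 4), dtype=mx.float16),
    mx.zeros((4, 4), dtype=mx.float16),
)                                                 # new dtype -> recompiles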
Another idiom to watch out for is compiling functions which get created and
destroyed frequently. This can happen, for example, when compiling an anonymous
function in a loop:
.. code-block:: python
a = mx.array(1.0)
# Don't do this, compiles lambda at each iteration
for _ in range(5):
mx.compile(lambda x: mx.exp(mx.abs(x)))(a)
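A simple fix is to compile the function once, outside the loop, and reuse it:

.. code-block:: python

import mlx.core as mx

a = mx.array(1.0)

# Compile once and reuse the same compiled function
fun = mx.compile(lambda x: mx.exp(mx.abs(x)))
for _ in range(5):
    fun(a)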
Example Speedup
---------------
The :func:`mlx.nn.gelu` is a nonlinear activation function commonly used with
Transformer-based models. The implementation involves several unary and binary
element-wise operations:
.. code-block:: python
def gelu(x):
return x * (1 + mx.erf(x / math.sqrt(2))) / 2
If you use this function with small arrays, it will be overhead bound. If you
use it with large arrays it will be memory bandwidth bound. However, all of
the operations in the ``gelu`` are fusible into a single kernel with
:func:`compile`. This can speedup both cases considerably.
Let's compare the runtime of the regular function versus the compiled
function. We'll use the following timing helper which does a warm up and
handles synchronization:
.. code-block:: python
import time
def timeit(fun, x):
# warm up
for _ in range(10):
mx.eval(fun(x))
tic = time.perf_counter()
for _ in range(100):
mx.eval(fun(x))
toc = time.perf_counter()
tpi = 1e3 * (toc - tic) / 100
print(f"Time per iteration {tpi:.3f} (ms)")
Now make an array, and benchmark both functions:
.. code-block:: python
x = mx.random.uniform(shape=(32, 1000, 4096))
timeit(nn.gelu, x)
timeit(mx.compile(nn.gelu), x)
On an M1 Max the times are 15.5 and 3.1 milliseconds. The compiled ``gelu`` is
five times faster.
Debugging
---------
When a compiled function is first called, it is traced with placeholder
inputs. This means you can't evaluate arrays (for example to print their
contents) inside compiled functions.
.. code-block:: python
@mx.compile
def fun(x):
z = -x
print(z) # Crash
return mx.exp(z)
fun(mx.array(5.0))
For debugging, inspecting arrays can be helpful. One way to do that is to
globally disable compilation using the :func:`disable_compile` function or
``MLX_DISABLE_COMPILE`` flag. For example the following is okay even though
``fun`` is compiled:
.. code-block:: python
@mx.compile
def fun(x):
z = -x
print(z) # Okay
return mx.exp(z)
mx.disable_compile()
fun(mx.array(5.0))
Pure Functions
--------------
Compiled functions are intended to be *pure*; that is they should not have side
effects. For example:
.. code-block:: python
state = []
@mx.compile
def fun(x, y):
z = x + y
state.append(z)
return mx.exp(z)
fun(mx.array(1.0), mx.array(2.0))
# Crash!
print(state)
After the first call of ``fun``, the ``state`` list will hold a placeholder
array. The placeholder does not have any data; it is only used to build the
computation graph. Printing such an array results in a crash.
You have two options to deal with this. The first option is to simply return
``state`` as an output:
.. code-block:: python
state = []
@mx.compile
def fun(x, y):
z = x + y
state.append(z)
return mx.exp(z), state
_, state = fun(mx.array(1.0), mx.array(2.0))
# Prints [array(3, dtype=float32)]
print(state)
In some cases returning updated state can be pretty inconvenient. Hence,
:func:`compile` has a parameter to capture implicit outputs:
.. code-block:: python
from functools import partial
state = []
# Tell compile to capture state as an output
@partial(mx.compile, outputs=state)
def fun(x, y):
z = x + y
state.append(z)
return mx.exp(z), state
fun(mx.array(1.0), mx.array(2.0))
# Prints [array(3, dtype=float32)]
print(state)
This is particularly useful for compiling a function which includes an update
to a container of arrays, as is commonly done when training the parameters of a
:class:`mlx.nn.Module`.
Compiled functions will also treat any inputs not in the parameter list as
constants. For example:
.. code-block:: python
state = [mx.array(1.0)]
@mx.compile
def fun(x):
return x + state[0]
# Prints array(2, dtype=float32)
print(fun(mx.array(1.0)))
# Update state
state[0] = mx.array(5.0)
# Still prints array(2, dtype=float32)
print(fun(mx.array(1.0)))
In order to have the change of state reflected in the outputs of ``fun`` you
again have two options. The first option is to simply pass ``state`` as input
to the function. In some cases this can be pretty inconvenient. Hence,
:func:`compile` also has a parameter to capture implicit inputs:
.. code-block:: python
from functools import partial
state = [mx.array(1.0)]
# Tell compile to capture state as an input
@partial(mx.compile, inputs=state)
def fun(x):
return x + state[0]
# Prints array(2, dtype=float32)
print(fun(mx.array(1.0)))
# Update state
state[0] = mx.array(5.0)
# Prints array(6, dtype=float32)
print(fun(mx.array(1.0)))
Compiling Training Graphs
-------------------------
This section will step through how to use :func:`compile` with a simple example
of a common setup: training a model with :obj:`mlx.nn.Module` using an
:obj:`mlx.optimizers.Optimizer` with state. We will show how to compile the
full forward, backward, and update with :func:`compile`.
To start, here is the simple example without any compilation:
.. code-block:: python
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
# 4 examples with 10 features each
x = mx.random.uniform(shape=(4, 10))
# 0, 1 targets
y = mx.array([0, 1, 0, 1])
# Simple linear model
model = nn.Linear(10, 1)
# SGD with momentum
optimizer = optim.SGD(learning_rate=0.1, momentum=0.8)
def loss_fn(model, x, y):
logits = model(x).squeeze()
return nn.losses.binary_cross_entropy(logits, y)
loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
# Perform 10 steps of gradient descent
for it in range(10):
loss, grads = loss_and_grad_fn(model, x, y)
optimizer.update(model, grads)
mx.eval(model.parameters(), optimizer.state)
To compile the update we can put it all in a function and compile it with the
appropriate input and output captures. Here's the same example but compiled:
.. code-block:: python
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
from functools import partial
# 4 examples with 10 features each
x = mx.random.uniform(shape=(4, 10))
# 0, 1 targets
y = mx.array([0, 1, 0, 1])
# Simple linear model
model = nn.Linear(10, 1)
# SGD with momentum
optimizer = optim.SGD(learning_rate=0.1, momentum=0.8)
def loss_fn(model, x, y):
logits = model(x).squeeze()
return nn.losses.binary_cross_entropy(logits, y)
# The state that will be captured as input and output
state = [model.state, optimizer.state]
@partial(mx.compile, inputs=state, outputs=state)
def step(x, y):
loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
loss, grads = loss_and_grad_fn(model, x, y)
optimizer.update(model, grads)
return loss
# Perform 10 steps of gradient descent
for it in range(10):
loss = step(x, y)
# Evaluate the model and optimizer state
mx.eval(state)
print(loss)
.. note::
If you are using a module which performs random sampling such as
:func:`mlx.nn.Dropout`, make sure you also include ``mx.random.state`` in the
``state`` captured by :func:`compile`, i.e. ``state = [model.state,
optimizer.state, mx.random.state]``.
.. note::
For more examples of compiling full training graphs checkout the `MLX
Examples <https://github.com/ml-explore/mlx-examples>`_ GitHub repo.
Transformations with Compile
----------------------------
In MLX function transformations are composable. You can apply any function
transformation to the output of any other function transformation. For more on
this, see the documentation on :ref:`function transforms
<function_transforms>`.
Compiling transformed functions works just as expected:
.. code-block:: python
grad_fn = mx.grad(mx.exp)
compiled_grad_fn = mx.compile(grad_fn)
# Prints: array(2.71828, dtype=float32)
print(grad_fn(mx.array(1.0)))
# Also prints: array(2.71828, dtype=float32)
print(compiled_grad_fn(mx.array(1.0)))
.. note::
In order to compile as much as possible, a transformation of a compiled
function will not by default be compiled. To compile the transformed
function simply pass it through :func:`compile`.
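For example, the sketch below first takes the gradient of a compiled function
(the gradient computation itself is not compiled) and then compiles the whole
transformed function:
.. code-block:: python
# The gradient of a compiled function is not compiled by default
grad_fn = mx.grad(mx.compile(mx.exp))
# Pass the transformed function through compile to compile it as well
compiled_grad_fn = mx.compile(grad_fn)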
You can also compile functions which themselves call compiled functions. A
good practice is to compile the outermost function to give :func:`compile`
the most opportunity to optimize the computation graph:
.. code-block:: python
@mx.compile
def inner(x):
return mx.exp(-mx.abs(x))
def outer(x):
    return inner(inner(x))
# Compiling the outer function is good to do as it will likely
# be faster even though the inner functions are compiled
fun = mx.compile(outer)
.. _shapeless_compile:
Shapeless Compilation
---------------------
When the shape of an input to a compiled function changes, the function is
recompiled. You can compile a function once and run it on inputs with
variable shapes by specifying ``shapeless=True`` to :func:`compile`. In this
case changes to the shapes of the inputs do not cause the function to be
recompiled.
.. code-block:: python
def fun(x, y):
return mx.abs(x + y)
compiled_fun = mx.compile(fun, shapeless=True)
x = mx.array(1.0)
y = mx.array(-2.0)
# First call compiles the function
print(compiled_fun(x, y))
# Second call with different shapes
# does not recompile the function
x = mx.array([1.0, -6.0])
y = mx.array([-2.0, 3.0])
print(compiled_fun(x, y))
Use shapeless compilations carefully. Since compilation is not triggered when
shapes change, any graphs which are conditional on the input shapes will not
work as expected. Shape-dependent computations are common and sometimes subtle
to detect. For example:
.. code-block:: python
def fun(x):
return x.reshape(x.shape[0] * x.shape[1], -1)
compiled_fun = mx.compile(fun, shapeless=True)
x = mx.random.uniform(shape=(2, 3, 4))
out = compiled_fun(x)
x = mx.random.uniform(shape=(5, 5, 3))
# Error, can't reshape (5, 5, 3) to (6, -1)
out = compiled_fun(x)
The second call to the ``compiled_fun`` fails because of the call to
:func:`reshape` which uses the static shape of ``x`` in the first call. We can
fix this by using :func:`flatten` to avoid hardcoding the shape of ``x``:
.. code-block:: python
def fun(x):
return x.flatten(0, 1)
compiled_fun = mx.compile(fun, shapeless=True)
x = mx.random.uniform(shape=(2, 3, 4))
out = compiled_fun(x)
x = mx.random.uniform(shape=(5, 5, 3))
# Ok
out = compiled_fun(x)
.. _usage_distributed:
Distributed Communication
=========================
.. currentmodule:: mlx.core.distributed
MLX utilizes `MPI <https://en.wikipedia.org/wiki/Message_Passing_Interface>`_ to
provide distributed communication operations that allow the computational cost
of training or inference to be shared across many physical machines. You can
see a list of the supported operations in the :ref:`API docs<distributed>`.
.. note::
Some operations may not be supported yet, or may not be as fast as they should be.
We are adding more operations and tuning the existing ones as we figure out the
best way to do distributed computing on Macs using MLX.
Getting Started
---------------
MLX already comes with the ability to "talk" to MPI if it is installed on the
machine. The minimal distributed program in MLX is as simple as:
.. code:: python
import mlx.core as mx
world = mx.distributed.init()
x = mx.distributed.all_sum(mx.ones(10))
print(world.rank(), x)
The program above sums the array ``mx.ones(10)`` across all
distributed processes. If simply run with ``python``, however, only one
process is launched and no distributed communication takes place.
To launch the program in distributed mode we need to use ``mpirun`` or
``mpiexec`` depending on the MPI installation. The simplest possible way is the
following:
.. code:: shell
$ mpirun -np 2 python test.py
1 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)
0 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)
The above launches two processes on the same (local) machine and we can see
both standard output streams. The processes send the array of 1s to each other
and compute the sum which is printed. Launching with ``mpirun -np 4 ...`` would
launch four processes, and each would print an array of 4s, and so on.
Installing MPI
---------------
MPI can be installed with Homebrew, using the Anaconda package manager or
compiled from source. Most of our testing is done using ``openmpi`` installed
with the Anaconda package manager as follows:
.. code:: shell
$ conda install openmpi
Installing with Homebrew may require specifying the location of ``libmpi.dylib``
so that MLX can find it and load it at runtime. This can simply be achieved by
passing the ``DYLD_LIBRARY_PATH`` environment variable to ``mpirun``.
.. code:: shell
$ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python test.py
Setting up Remote Hosts
-----------------------
MPI can automatically connect to remote hosts and set up the communication over
the network if the remote hosts can be accessed via ssh. A good checklist to
debug connectivity issues is the following:
* ``ssh hostname`` works from all machines to all machines without asking for
password or host confirmation
* ``mpirun`` is accessible on all machines. You can call ``mpirun`` using its
full path to force all machines to use a specific path.
* Ensure that the ``hostname`` used by MPI is the one that you have configured
in the ``.ssh/config`` files on all machines.
.. note::
For an example hostname ``foo.bar.com`` MPI can use only ``foo`` as
the hostname passed to ssh if the current hostname matches ``*.bar.com``.
An easy way to pass the host names to MPI is using a host file. A host file
looks like the following, where ``host1`` and ``host2`` should be the fully
qualified domain names or IPs for these hosts.
.. code::
host1 slots=1
host2 slots=1
When using MLX, it is very likely that you want to use 1 slot per host, i.e., one
process per host. The hostfile also needs to contain the current
host if you want to run on the local host. Passing the host file to
``mpirun`` is simply done using the ``--hostfile`` command line argument.
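For instance, assuming the host file above is saved as ``hosts.txt`` (a
placeholder name), the launch could look like:
.. code:: shell
$ mpirun -np 2 --hostfile hosts.txt python test.py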
Training Example
----------------
In this section we will adapt an MLX training loop to support data parallel
distributed training. Namely, we will average the gradients across a set of
hosts before applying them to the model.
Our training loop looks like the following code snippet if we omit the model,
dataset and optimizer initialization.
.. code:: python
model = ...
optimizer = ...
dataset = ...
def step(model, x, y):
loss, grads = loss_grad_fn(model, x, y)
optimizer.update(model, grads)
return loss
for x, y in dataset:
loss = step(model, x, y)
mx.eval(loss, model.parameters())
All we have to do to average the gradients across machines is perform an
:func:`all_sum` and divide by the size of the :class:`Group`. Namely we
have to :func:`mlx.utils.tree_map` the gradients with the following function.
.. code:: python
def all_avg(x):
return mx.distributed.all_sum(x) / mx.distributed.init().size()
Putting everything together our training loop step looks as follows with
everything else remaining the same.
.. code:: python
from mlx.utils import tree_map
def all_reduce_grads(grads):
N = mx.distributed.init().size()
if N == 1:
return grads
return tree_map(
lambda x: mx.distributed.all_sum(x) / N,
grads
)
def step(model, x, y):
loss, grads = loss_grad_fn(model, x, y)
grads = all_reduce_grads(grads) # <--- This line was added
optimizer.update(model, grads)
return loss
Tuning All Reduce
-----------------
We are working on improving the performance of all reduce in MLX, but for now the
two main things you can do to get the most out of distributed training with MLX are:
1. Perform a few large reductions instead of many small ones to improve
bandwidth and latency
2. Pass ``--mca btl_tcp_links 4`` to ``mpirun`` to configure it to use 4 TCP
connections between each host to improve bandwidth (see the example invocation below)
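For reference, a full launch command combining this flag with a host file might
look like the following, where ``hosts.txt`` and ``train.py`` are placeholder names:
.. code:: shell
$ mpirun --mca btl_tcp_links 4 -np 2 --hostfile hosts.txt python train.py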
docs/src/usage/export.rst
.. _export_usage:
Exporting Functions
===================
.. currentmodule:: mlx.core
MLX has an API to export and import functions to and from a file. This lets you
run computations written in one MLX front-end (e.g. Python) in another MLX
front-end (e.g. C++).
This guide walks through the basics of the MLX export API with some examples.
To see the full list of functions check out the :ref:`API documentation
<export>`.
Basics of Exporting
-------------------
Let's start with a simple example:
.. code-block:: python
def fun(x, y):
return x + y
x = mx.array(1.0)
y = mx.array(1.0)
mx.export_function("add.mlxfn", fun, x, y)
To export a function, provide sample input arrays that the function
can be called with. The data doesn't matter, but the shapes and types of the
arrays do. In the above example we exported ``fun`` with two ``float32``
scalar arrays. We can then import the function and run it:
.. code-block:: python
add_fun = mx.import_function("add.mlxfn")
out, = add_fun(mx.array(1.0), mx.array(2.0))
# Prints: array(3, dtype=float32)
print(out)
out, = add_fun(mx.array(1.0), mx.array(3.0))
# Prints: array(4, dtype=float32)
print(out)
# Raises an exception
add_fun(mx.array(1), mx.array(3.0))
# Raises an exception
add_fun(mx.array([1.0, 2.0]), mx.array(3.0))
Notice the third and fourth calls to ``add_fun`` raise exceptions because the
shapes and types of the inputs are different from the shapes and types of the
example inputs we exported the function with.
Also notice that even though the original ``fun`` returns a single output
array, the imported function always returns a tuple of one or more arrays.
The inputs to :func:`export_function` and to an imported function can be
specified as variable positional arguments or as a tuple of arrays:
.. code-block:: python
def fun(x, y):
return x + y
x = mx.array(1.0)
y = mx.array(1.0)
# Both arguments to fun are positional
mx.export_function("add.mlxfn", fun, x, y)
# Same as above
mx.export_function("add.mlxfn", fun, (x, y))
imported_fun = mx.import_function("add.mlxfn")
# Ok
out, = imported_fun(x, y)
# Also ok
out, = imported_fun((x, y))
You can pass example inputs to functions as positional or keyword arguments. If
you use keyword arguments to export the function, then you have to use the same
keyword arguments when calling the imported function.
.. code-block:: python
def fun(x, y):
return x + y
# One argument to fun is positional, the other is a kwarg
mx.export_function("add.mlxfn", fun, x, y=y)
imported_fun = mx.import_function("add.mlxfn")
# Ok
out, = imported_fun(x, y=y)
# Also ok
out, = imported_fun((x,), {"y": y})
# Raises since the keyword argument is missing
out, = imported_fun(x, y)
# Raises since the keyword argument has the wrong key
out, = imported_fun(x, z=y)
Exporting Modules
-----------------
An :obj:`mlx.nn.Module` can be exported with or without the parameters included
in the exported function. Here's an example:
.. code-block:: python
model = nn.Linear(4, 4)
mx.eval(model.parameters())
def call(x):
return model(x)
mx.export_function("model.mlxfn", call, mx.zeros(4))
In the above example, the :obj:`mlx.nn.Linear` module is exported. Its
parameters are also saved to the ``model.mlxfn`` file.
.. note::
For enclosed arrays inside an exported function, be extra careful to ensure
they are evaluated. The computation graph that gets exported will include
the computation that produces enclosed inputs.
If the above example were missing ``mx.eval(model.parameters())``, the
exported function would include the random initialization of the
:obj:`mlx.nn.Module` parameters.
If you only want to export the ``Module.__call__`` function without the
parameters, pass them as inputs to the ``call`` wrapper:
.. code-block:: python
from mlx.utils import tree_flatten, tree_unflatten
model = nn.Linear(4, 4)
mx.eval(model.parameters())
def call(x, **params):
# Set the model's parameters to the input parameters
model.update(tree_unflatten(list(params.items())))
return model(x)
params = dict(tree_flatten(model.parameters()))
mx.export_function("model.mlxfn", call, (mx.zeros(4),), params)
Shapeless Exports
-----------------
Just like :func:`compile`, functions can also be exported for dynamically shaped
inputs. Pass ``shapeless=True`` to :func:`export_function` or :func:`exporter`
to export a function which can be used for inputs with variable shapes:
.. code-block:: python
mx.export_function("fun.mlxfn", mx.abs, mx.array(0.0), shapeless=True)
imported_abs = mx.import_function("fun.mlxfn")
# Ok
out, = imported_abs(mx.array(-1.0))
# Also ok
out, = imported_abs(mx.array([-1.0, -2.0]))
With ``shapeless=False`` (which is the default), the second call to
``imported_abs`` would raise an exception with a shape mismatch.
Shapeless exporting works the same as shapeless compilation and should be
used carefully. See the :ref:`documentation on shapeless compilation
<shapeless_compile>` for more information.
Exporting Multiple Traces
-------------------------
In some cases, functions build different computation graphs for different
input arguments. A simple way to manage this is to export to a new file with
each set of inputs. This is a fine option in many cases. But it can be
suboptimal if the exported functions have a large amount of duplicate constant
data (for example the parameters of a :obj:`mlx.nn.Module`).
The export API in MLX lets you export multiple traces of the same function to
a single file by creating an exporting context manager with :func:`exporter`:
.. code-block:: python
def fun(x, y=None):
constant = mx.array(3.0)
if y is not None:
x += y
return x + constant
with mx.exporter("fun.mlxfn", fun) as exporter:
exporter(mx.array(1.0))
exporter(mx.array(1.0), y=mx.array(0.0))
imported_function = mx.import_function("fun.mlxfn")
# Call the function with y=None
out, = imported_function(mx.array(1.0))
print(out)
# Call the function with y specified
out, = imported_function(mx.array(1.0), y=mx.array(1.0))
print(out)
In the above example the function's constant data (i.e. ``constant``) is only
saved once.
Transformations with Imported Functions
---------------------------------------
Function transformations like :func:`grad`, :func:`vmap`, and :func:`compile` work
on imported functions just like regular Python functions:
.. code-block:: python
def fun(x):
return mx.sin(x)
x = mx.array(0.0)
mx.export_function("sine.mlxfn", fun, x)
imported_fun = mx.import_function("sine.mlxfn")
# Take the derivative of the imported function
dfdx = mx.grad(lambda x: imported_fun(x)[0])
# Prints: array(1, dtype=float32)
print(dfdx(x))
# Compile the imported function
compiled_fun = mx.compile(imported_fun)
# Prints: array(0, dtype=float32)
print(compiled_fun(x)[0])
Importing Functions in C++
--------------------------
Importing and running functions in C++ is basically the same as importing and
running them in Python. First, follow the :ref:`instructions <mlx_in_cpp>` to
setup a simple C++ project that uses MLX as a library.
Next, export a simple function from Python:
.. code-block:: python
def fun(x, y):
return mx.exp(x + y)
x = mx.array(1.0)
y = mx.array(1.0)
mx.export_function("fun.mlxfn", fun, x, y)
Import and run the function in C++ with only a few lines of code:
.. code-block:: c++
auto fun = mx::import_function("fun.mlxfn");
auto inputs = {mx::array(1.0), mx::array(1.0)};
auto outputs = fun(inputs);
// Prints: array(7.38906, dtype=float32)
std::cout << outputs[0] << std::endl;
Imported functions can be transformed in C++ just like in Python. Use
``std::vector<mx::array>`` for positional arguments and ``std::map<std::string,
mx::array>`` for keyword arguments when calling imported functions in C++.
More Examples
-------------
Here are a few more complete examples exporting more complex functions from
Python and importing and running them in C++:
* `Inference and training a multi-layer perceptron <https://github.com/ml-explore/mlx/tree/main/examples/export>`_
.. _function_transforms:
Function Transforms
===================
.. currentmodule:: mlx.core
MLX uses composable function transformations for automatic differentiation,
vectorization, and compute graph optimizations. To see the complete list of
function transformations check-out the :ref:`API documentation <transforms>`.
The key idea behind composable function transformations is that every
transformation returns a function which can be further transformed.
Here is a simple example:
.. code-block:: shell
>>> dfdx = mx.grad(mx.sin)
>>> dfdx(mx.array(mx.pi))
array(-1, dtype=float32)
>>> mx.cos(mx.array(mx.pi))
array(-1, dtype=float32)
The output of :func:`grad` on :func:`sin` is simply another function. In this
case it is the gradient of the sine function which is exactly the cosine
function. To get the second derivative you can do:
.. code-block:: shell
>>> d2fdx2 = mx.grad(mx.grad(mx.sin))
>>> d2fdx2(mx.array(mx.pi / 2))
array(-1, dtype=float32)
>>> mx.sin(mx.array(mx.pi / 2))
array(1, dtype=float32)
Using :func:`grad` on the output of :func:`grad` is always ok. You keep
getting higher order derivatives.
Any of the MLX function transformations can be composed in any order to any
depth. See the following sections for more information on :ref:`automatic
differentiation <auto diff>` and :ref:`automatic vectorization <vmap>`.
For more information on :func:`compile` see the :ref:`compile documentation <compile>`.
Automatic Differentiation
-------------------------
.. _auto diff:
Automatic differentiation in MLX works on functions rather than on implicit
graphs.
.. note::
If you are coming to MLX from PyTorch, you no longer need functions like
``backward``, ``zero_grad``, and ``detach``, or properties like
``requires_grad``.
The most basic example is taking the gradient of a scalar-valued function as we
saw above. You can use the :func:`grad` and :func:`value_and_grad` functions to
compute gradients of more complex functions. By default these functions compute
the gradient with respect to the first argument:
.. code-block:: python
def loss_fn(w, x, y):
return mx.mean(mx.square(w * x - y))
w = mx.array(1.0)
x = mx.array([0.5, -0.5])
y = mx.array([1.5, -1.5])
# Computes the gradient of loss_fn with respect to w:
grad_fn = mx.grad(loss_fn)
dloss_dw = grad_fn(w, x, y)
# Prints array(-1, dtype=float32)
print(dloss_dw)
# To get the gradient with respect to x we can do:
grad_fn = mx.grad(loss_fn, argnums=1)
dloss_dx = grad_fn(w, x, y)
# Prints array([-1, 1], dtype=float32)
print(dloss_dx)
One way to get the loss and gradient is to call ``loss_fn`` followed by
``grad_fn``, but this can result in a lot of redundant work. Instead, you
should use :func:`value_and_grad`. Continuing the above example:
.. code-block:: python
# Computes the gradient of loss_fn with respect to w:
loss_and_grad_fn = mx.value_and_grad(loss_fn)
loss, dloss_dw = loss_and_grad_fn(w, x, y)
# Prints array(1, dtype=float32)
print(loss)
# Prints array(-1, dtype=float32)
print(dloss_dw)
You can also take the gradient with respect to arbitrarily nested Python
containers of arrays (specifically any of :obj:`list`, :obj:`tuple`, or
:obj:`dict`).
Suppose we wanted a weight and a bias parameter in the above example. A nice
way to do that is the following:
.. code-block:: python
def loss_fn(params, x, y):
w, b = params["weight"], params["bias"]
h = w * x + b
return mx.mean(mx.square(h - y))
params = {"weight": mx.array(1.0), "bias": mx.array(0.0)}
x = mx.array([0.5, -0.5])
y = mx.array([1.5, -1.5])
# Computes the gradient of loss_fn with respect to both the
# weight and bias:
grad_fn = mx.grad(loss_fn)
grads = grad_fn(params, x, y)
# Prints
# {'weight': array(-1, dtype=float32), 'bias': array(0, dtype=float32)}
print(grads)
Notice the tree structure of the parameters is preserved in the gradients.
In some cases you may want to stop gradients from propagating through a
part of the function. You can use :func:`stop_gradient` for that.
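As a small illustrative sketch, wrapping part of a computation in
:func:`stop_gradient` treats it as a constant for differentiation:
.. code-block:: python
def fun(x):
    # The stop_gradient term is treated as a constant when differentiating
    return (x * mx.stop_gradient(x)).sum()
x = mx.array([1.0, 2.0])
# Prints array([1, 2], dtype=float32) instead of 2 * x
print(mx.grad(fun)(x))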
Automatic Vectorization
-----------------------
.. _vmap:
Use :func:`vmap` to automate vectorizing complex functions. Here we'll go
through a basic and contrived example for the sake of clarity, but :func:`vmap`
can be quite powerful for more complex functions which are difficult to optimize
by hand.
.. warning::
Some operations are not yet supported with :func:`vmap`. If you encounter an error
like ``ValueError: Primitive's vmap not implemented.``, file an `issue
<https://github.com/ml-explore/mlx/issues>`_ and include your function.
We will prioritize adding it.
A naive way to add the elements from two sets of vectors is with a loop:
.. code-block:: python
xs = mx.random.uniform(shape=(4096, 100))
ys = mx.random.uniform(shape=(100, 4096))
def naive_add(xs, ys):
return [xs[i] + ys[:, i] for i in range(xs.shape[0])]
Instead you can use :func:`vmap` to automatically vectorize the addition:
.. code-block:: python
# Vectorize over the second dimension of x and the
# first dimension of y
vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(0, 1))
The ``in_axes`` parameter can be used to specify which dimensions of the
corresponding input to vectorize over. Similarly, use ``out_axes`` to specify
where the vectorized axes should be in the outputs.
Let's time these two different versions:
.. code-block:: python
import timeit
print(timeit.timeit(lambda: mx.eval(naive_add(xs, ys)), number=100))
print(timeit.timeit(lambda: mx.eval(vmap_add(xs, ys)), number=100))
On an M1 Max the naive version takes in total ``5.639`` seconds whereas the
vectorized version takes only ``0.024`` seconds, more than 200 times faster.
Of course, this operation is quite contrived. A better approach is to simply do
``xs + ys.T``, but for more complex functions :func:`vmap` can be quite handy.
docs/src/usage/indexing.rst
.. _indexing:
Indexing Arrays
===============
.. currentmodule:: mlx.core
For the most part, indexing an MLX :obj:`array` works the same as indexing a
NumPy :obj:`numpy.ndarray`. See the `NumPy documentation
<https://numpy.org/doc/stable/user/basics.indexing.html>`_ for more details on
how that works.
For example, you can use regular integers and slices (:obj:`slice`) to index arrays:
.. code-block:: shell
>>> arr = mx.arange(10)
>>> arr[3]
array(3, dtype=int32)
>>> arr[-2] # negative indexing works
array(8, dtype=int32)
>>> arr[2:8:2] # start, stop, stride
array([2, 4, 6], dtype=int32)
For multi-dimensional arrays, the ``...`` or :obj:`Ellipsis` syntax works as in NumPy:
.. code-block:: shell
>>> arr = mx.arange(8).reshape(2, 2, 2)
>>> arr[:, :, 0]
array([[0, 2],
       [4, 6]], dtype=int32)
>>> arr[..., 0]
array([[0, 2],
       [4, 6]], dtype=int32)
You can index with ``None`` to create a new axis:
.. code-block:: shell
>>> arr = mx.arange(8)
>>> arr.shape
[8]
>>> arr[None].shape
[1, 8]
You can also use an :obj:`array` to index another :obj:`array`:
.. code-block:: shell
>>> arr = mx.arange(10)
>>> idx = mx.array([5, 7])
>>> arr[idx]
array([5, 7], dtype=int32)
Mixing and matching integers, :obj:`slice`, ``...``, and :obj:`array` indices
works just as in NumPy.
Other functions which may be useful for indexing arrays are :func:`take` and
:func:`take_along_axis`.
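As a quick sketch of those two functions:
.. code-block:: python
a = mx.array([[1, 2, 3], [4, 5, 6]])
# Gather whole rows by index
print(mx.take(a, mx.array([1]), axis=0))
# Prints: array([[4, 5, 6]], dtype=int32)
# Gather one element per row along the last axis
idx = mx.array([[2], [0]])
print(mx.take_along_axis(a, idx, axis=-1))
# Prints: array([[3], [4]], dtype=int32)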
Differences from NumPy
----------------------
.. Note::
MLX indexing is different from NumPy indexing in two important ways:
* Indexing does not perform bounds checking. Indexing out of bounds is
undefined behavior.
* Boolean mask based indexing is not yet supported.
The reason for the lack of bounds checking is that exceptions cannot propagate
from the GPU. Performing bounds checking for array indices before launching the
kernel would be extremely inefficient.
Indexing with boolean masks is something that MLX may support in the future. In
general, MLX has limited support for operations for which output
*shapes* are dependent on input *data*. Other examples of these types of
operations which MLX does not yet support include :func:`numpy.nonzero` and the
single input version of :func:`numpy.where`.
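Until boolean-mask indexing is supported, one common fixed-shape workaround is
to select values with the three-argument :func:`where`. This is only a sketch,
not a drop-in replacement, since the output keeps the original shape:
.. code-block:: python
x = mx.array([1.0, -2.0, 3.0, -4.0])
mask = x > 0
# Keeps the original shape, with masked-out positions set to 0
print(mx.where(mask, x, 0.0))
# Prints: array([1, 0, 3, 0], dtype=float32)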
In Place Updates
----------------
In place updates to indexed arrays are possible in MLX. For example:
.. code-block:: shell
>>> a = mx.array([1, 2, 3])
>>> a[2] = 0
>>> a
array([1, 2, 0], dtype=int32)
Just as in NumPy, in place updates will be reflected in all references to the
same array:
.. code-block:: shell
>>> a = mx.array([1, 2, 3])
>>> b = a
>>> b[2] = 0
>>> b
array([1, 2, 0], dtype=int32)
>>> a
array([1, 2, 0], dtype=int32)
Transformations of functions which use in-place updates are allowed and work as
expected. For example:
.. code-block:: python
def fun(x, idx):
x[idx] = 2.0
return x.sum()
dfdx = mx.grad(fun)(mx.array([1.0, 2.0, 3.0]), mx.array([1]))
print(dfdx) # Prints: array([1, 0, 1], dtype=float32)
In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx``
and ones elsewhere.
.. _lazy eval:
Lazy Evaluation
===============
.. currentmodule:: mlx.core
Why Lazy Evaluation
-------------------
When you perform operations in MLX, no computation actually happens. Instead a
compute graph is recorded. The actual computation only happens if an
:func:`eval` is performed.
MLX uses lazy evaluation because it has some nice features, some of which we
describe below.
Transforming Compute Graphs
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Lazy evaluation lets us record a compute graph without actually doing any
computations. This is useful for function transformations like :func:`grad` and
:func:`vmap` and graph optimizations.
Currently, MLX does not compile and rerun compute graphs. They are all
generated dynamically. However, lazy evaluation makes it much easier to
integrate compilation for future performance enhancements.
Only Compute What You Use
^^^^^^^^^^^^^^^^^^^^^^^^^
In MLX you do not need to worry as much about computing outputs that are never
used. For example:
.. code-block:: python
def fun(x):
a = fun1(x)
b = expensive_fun(a)
return a, b
y, _ = fun(x)
Here, we never actually compute the output of ``expensive_fun``. Use this
pattern with care though, as the graph of ``expensive_fun`` is still built, and
that has some cost associated to it.
Similarly, lazy evaluation can be beneficial for saving memory while keeping
code simple. Say you have a very large model ``Model`` derived from
:obj:`mlx.nn.Module`. You can instantiate this model with ``model = Model()``.
Typically, this will initialize all of the weights as ``float32``, but the
initialization does not actually compute anything until you perform an
:func:`eval`. If you update the model with ``float16`` weights, your maximum
consumed memory will be half that required if eager computation was used
instead.
This pattern is simple to do in MLX thanks to lazy computation:
.. code-block:: python
model = Model() # no memory used yet
model.load_weights("weights_fp16.safetensors")
When to Evaluate
----------------
A common question is when to use :func:`eval`. The trade-off is between
letting graphs get too large and not batching enough useful work.
For example:
.. code-block:: python
for _ in range(100):
a = a + b
mx.eval(a)
b = b * 2
mx.eval(b)
This is a bad idea because there is some fixed overhead with each graph
evaluation. On the other hand, there is some slight overhead which grows with
the compute graph size, so extremely large graphs (while computationally
correct) can be costly.
Luckily, a wide range of compute graph sizes work pretty well with MLX:
anything from a few tens of operations to many thousands of operations per
evaluation should be okay.
Most numerical computations have an iterative outer loop (e.g. the iteration in
stochastic gradient descent). A natural and usually efficient place to use
:func:`eval` is at each iteration of this outer loop.
Here is a concrete example:
.. code-block:: python
for batch in dataset:
# Nothing has been evaluated yet
loss, grad = value_and_grad_fn(model, batch)
# Still nothing has been evaluated
optimizer.update(model, grad)
# Evaluate the loss and the new parameters which will
# run the full gradient computation and optimizer update
mx.eval(loss, model.parameters())
An important behavior to be aware of is when the graph will be implicitly
evaluated. Anytime you ``print`` an array, convert it to an
:obj:`numpy.ndarray`, or otherwise access its memory via :obj:`memoryview`,
the graph will be evaluated. Saving arrays via :func:`save` (or any other MLX
saving functions) will also evaluate the array.
Calling :func:`array.item` on a scalar array will also evaluate it. In the
example above, printing the loss (``print(loss)``) or adding the loss scalar to
a list (``losses.append(loss.item())``) would cause a graph evaluation. If
these lines are before ``mx.eval(loss, model.parameters())`` then this
will be a partial evaluation, computing only the forward pass.
Also, calling :func:`eval` on an array or set of arrays multiple times is
perfectly fine. This is effectively a no-op.
.. warning::
Using scalar arrays for control-flow will cause an evaluation.
Here is an example:
.. code-block:: python
def fun(x):
h, y = first_layer(x)
if y > 0: # An evaluation is done here!
z = second_layer_a(h)
else:
z = second_layer_b(h)
return z
Using arrays for control flow should be done with care. The above example works
and can even be used with gradient transformations. However, this can be very
inefficient if evaluations are done too frequently.
docs/src/usage/numpy.rst
.. _numpy:
Conversion to NumPy and Other Frameworks
========================================
MLX arrays support conversion to and from other frameworks with either:
* The `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_.
* `DLPack <https://dmlc.github.io/dlpack/latest/>`_.
Let's convert an array to NumPy and back.
.. code-block:: python
import mlx.core as mx
import numpy as np
a = mx.arange(3)
b = np.array(a) # copy of a
c = mx.array(b) # copy of b
.. note::
Since NumPy does not support ``bfloat16`` arrays, you will need to convert to ``float16`` or ``float32`` first:
``np.array(a.astype(mx.float32))``.
Otherwise, you will receive an error like: ``Item size 2 for PEP 3118 buffer format string does not match the dtype V item size 0.``
By default, NumPy copies data to a new array. This can be prevented by creating an array view:
.. code-block:: python
a = mx.arange(3)
a_view = np.array(a, copy=False)
print(a_view.flags.owndata) # False
a_view[0] = 1
print(a[0].item()) # 1
A NumPy array view is a normal NumPy array, except that it does not own its memory.
This means writing to the view is reflected in the original array.
While this is quite powerful for avoiding array copies, note that external changes to the memory of arrays are not reflected in gradients.
Let's demonstrate this in an example:
.. code-block:: python
def f(x):
x_view = np.array(x, copy=False)
x_view[:] *= x_view # modify memory without telling mx
return x.sum()
x = mx.array([3.0])
y, df = mx.value_and_grad(f)(x)
print("f(x) = x² =", y.item()) # 9.0
print("f'(x) = 2x !=", df.item()) # 1.0
The function ``f`` indirectly modifies the array ``x`` through a memory view.
However, this modification is not reflected in the gradient, as seen in the last line outputting ``1.0``,
representing the gradient of the sum operation alone.
The squaring of ``x`` occurs externally to MLX, meaning that no gradient is incorporated.
It's important to note that a similar issue arises during array conversion and copying.
For instance, a function defined as ``mx.array(np.array(x)**2).sum()`` would also result in an incorrect gradient,
even though no in-place operations on MLX memory are executed.
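For comparison, here is a sketch of the same computation kept entirely in MLX,
where the gradient comes out as expected:
.. code-block:: python
def f(x):
    # Square inside MLX so the gradient sees the full computation
    return (x * x).sum()
x = mx.array([3.0])
y, df = mx.value_and_grad(f)(x)
print("f(x) = x² =", y.item())   # 9.0
print("f'(x) = 2x =", df.item()) # 6.0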
PyTorch
-------
.. warning::
PyTorch support for :obj:`memoryview` is experimental and can break for
multi-dimensional arrays. Casting to NumPy first is advised for now.
PyTorch supports the buffer protocol, but it requires an explicit :obj:`memoryview`.
.. code-block:: python
import mlx.core as mx
import torch
a = mx.arange(3)
b = torch.tensor(memoryview(a))
c = mx.array(b.numpy())
Conversion from PyTorch tensors back to MLX arrays must be done via intermediate NumPy arrays with ``numpy()``.
JAX
---
JAX fully supports the buffer protocol.
.. code-block:: python
import mlx.core as mx
import jax.numpy as jnp
a = mx.arange(3)
b = jnp.array(a)
c = mx.array(b)
TensorFlow
----------
TensorFlow supports the buffer protocol, but it requires an explicit :obj:`memoryview`.
.. code-block:: python
import mlx.core as mx
import tensorflow as tf
a = mx.arange(3)
b = tf.constant(memoryview(a))
c = mx.array(b)
automatically evaluate the array.
>>> np.array(c)  # Also evaluates c
array([2., 4., 6., 8.], dtype=float32)
See the page on :ref:`Lazy Evaluation <lazy eval>` for more details.
Function and Graph Transformations
----------------------------------
Other gradient transformations include :func:`vjp` for vector-Jacobian products
and :func:`jvp` for Jacobian-vector products.
Use :func:`value_and_grad` to efficiently compute both a function's output and
gradient with respect to the function's input.
Devices and Streams
-------------------