binding + tests

works
try dynamic reshape
2025-09-10 09:13:25 +08:00 · 2024-12-09 12:57:36 -08:00 · 2024-12-09 12:57:36 -08:00 · 2024-12-09 12:57:36 -08:00 · 2024-12-09 11:09:02 -08:00 · 2024-12-09 09:26:18 -08:00
298 changed files with 16408 additions and 10528 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -13,8 +13,62 @@ parameters:
  test_release:
    type: boolean
    default: false
+  linux_release:
+    type: boolean
+    default: false

 jobs:
+  build_documentation:  # Builds the docs; pushes them to gh-pages when upload-docs is true.
+    parameters:
+      upload-docs:
+        type: boolean
+        default: false
+    macos:
+      xcode: "15.2.0"
+    resource_class: macos.m1.medium.gen1
+    steps:
+      - checkout
+      - run:
+          name: Install
+          command: |
+            brew install python@3.9
+            brew install doxygen
+            python3.9 -m venv env
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install --upgrade cmake
+            pip install -r docs/requirements.txt
+            CMAKE_BUILD_PARALLEL_LEVEL=`sysctl -n hw.ncpu` pip install . -v
+      - when:  # upload-docs is false: build the documentation only
+          condition:
+            not: << parameters.upload-docs >>
+          steps:
+            - run:
+                name: Build documentation
+                command: |
+                  source env/bin/activate
+                  cd docs && doxygen && make html O=-W
+      - when:  # upload-docs is true: rebuild on gh-pages and force-push the result
+          condition: << parameters.upload-docs >>
+          steps:
+            - add_ssh_keys:
+                fingerprints:
+                  - "SHA256:OhcVVMovbT0pkgMeiVRyxMnjV9R2t+hKBsNcuxq9h+0"
+            - run:
+                name: Upload documentation
+                command: |
+                  source env/bin/activate
+                  git config user.email "mlx@group.apple.com"
+                  git config user.name "CircleCI Docs"
+                  git checkout gh-pages
+                  git rebase main
+                  cd docs
+                  git rm -rf build/html
+                  doxygen && make html O=-W
+                  git add -f build/html
+                  git commit -m "rebase"
+                  git push -f origin gh-pages
+
  linux_build_and_test:
    docker:
      - image: cimg/python:3.9
@@ -31,7 +85,7 @@ jobs:
          name: Install dependencies
          command: |
            pip install --upgrade cmake
-            pip install nanobind==2.1.0
+            pip install nanobind==2.2.0
            pip install numpy
            sudo apt-get update
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
@@ -77,13 +131,13 @@ jobs:
      - run:
          name: Install dependencies
          command: |
-            brew install python@3.8
+            brew install python@3.9
            brew install openmpi
-            python3.8 -m venv env
+            python3.9 -m venv env
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.1.0
+            pip install nanobind==2.2.0
            pip install numpy
            pip install torch
            pip install tensorflow
@@ -105,7 +159,7 @@ jobs:
            source env/bin/activate
            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
-            mpirun -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
+            mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
      - run:
          name: Build example extension
          command: |
@@ -172,7 +226,7 @@ jobs:
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.1.0
+            pip install nanobind==2.2.0
            pip install --upgrade setuptools
            pip install numpy
            pip install twine
@@ -208,7 +262,7 @@ jobs:
      - store_artifacts:
          path: dist/

-  build_linux_test_release:
+  build_linux_release:
    parameters:
      python_version:
        type: string
@@ -237,12 +291,13 @@ jobs:
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.1.0
+            pip install nanobind==2.2.0
            pip install --upgrade setuptools
            pip install numpy
            pip install auditwheel
            pip install patchelf
            pip install build
+            pip install twine
            << parameters.extra_env >> \
              CMAKE_BUILD_PARALLEL_LEVEL=`nproc` \
              pip install . -v
@@ -253,6 +308,11 @@ jobs:
              python -m build --wheel
            auditwheel show dist/*
            auditwheel repair dist/* --plat manylinux_2_31_x86_64
+      - run:
+          name: Upload package
+          command: |
+            source env/bin/activate
+            twine upload wheelhouse/*
      - store_artifacts:
          path: wheelhouse/

@@ -272,6 +332,7 @@ workflows:
            parameters:
              xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
      - linux_build_and_test
+      - build_documentation 

  build_pypi_release:
    when:
@@ -288,9 +349,17 @@ workflows:
              ignore: /.*/
          matrix:
            parameters:
-              python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              xcode_version: ["15.0.0", "15.2.0"]
              build_env: ["PYPI_RELEASE=1"]
+      - build_documentation:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          upload-docs: true
+
  prb:
    when:
      matches:
@@ -317,7 +386,7 @@ workflows:
      - build_release:
          matrix:
            parameters:
-              python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              xcode_version: ["15.0.0", "15.2.0"]
  weekly_build:
    when:
@@ -328,17 +397,17 @@ workflows:
      - build_release:
          matrix:
            parameters:
-              python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              xcode_version: ["15.0.0", "15.2.0", "16.0.0"]
              build_env: ["DEV_RELEASE=1"]
  linux_test_release:
    when:
      and:
        - equal: [ main, << pipeline.git.branch >> ]
-        - << pipeline.parameters.test_release >>
+        - << pipeline.parameters.linux_release >>
    jobs:
-      - build_linux_test_release:
+      - build_linux_release:
          matrix:
            parameters:
-              python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
              extra_env: ["PYPI_RELEASE=1"]
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,16 +1,21 @@
 repos:
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v18.1.8
+    rev: v19.1.4
    hooks:
    -   id: clang-format
 # Using this mirror lets us use mypyc-compiled black, which is about 2x faster
 -   repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.8.0
+    rev: 24.10.0
    hooks:
    -   id: black
+    
 -   repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
    -   id: isort
        args:
            - --profile=black
+- repo: https://github.com/cheshirekow/cmake-format-precommit
+  rev: v0.6.13
+  hooks:
+    - id: cmake-format
--- a/ACKNOWLEDGMENTS.md
+++ b/ACKNOWLEDGMENTS.md
@@ -7,7 +7,7 @@ with a short description of your contribution(s) below. For example:

 MLX was developed with contributions from the following individuals:

- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`.
+- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`. Added `cross`.
 - Juarez Bochi: Fixed bug in cross attention.
 - Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
 - Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream`, safetensors support, `einsum`, and `einsum_path`.
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -0,0 +1,24 @@
+cff-version: 1.2.0
+title: mlx
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+type: software
+authors:
+  - given-names: Awni
+    family-names: Hannun
+    affiliation: Apple
+  - given-names: Jagrit
+    family-names: Digani
+    affiliation: Apple
+  - given-names: Angelos
+    family-names: Katharopoulos
+    affiliation: Apple
+  - given-names: Ronan
+    family-names: Collobert
+    affiliation: Apple
+repository-code: 'https://github.com/ml-explore/mlx'
+abstract: >-
+  MLX: efficient and flexible machine learning on Apple
+  silicon
+license: MIT
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,32 +24,34 @@ option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
 option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)

 if(NOT MLX_VERSION)
-  set(MLX_VERSION 0.17.3)
+  set(MLX_VERSION 0.21.1)
 endif()

 # --------------------- Processor tests -------------------------

-message(STATUS "Building MLX for ${CMAKE_SYSTEM_PROCESSOR} processor on ${CMAKE_SYSTEM_NAME}")
+message(
+  STATUS
+    "Building MLX for ${CMAKE_SYSTEM_PROCESSOR} processor on ${CMAKE_SYSTEM_NAME}"
+)

-set(MLX_BUILD_ARM OFF)
-
-if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
  if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
    if(NOT MLX_ENABLE_X64_MAC)
-      message(FATAL_ERROR
-        "Building for x86_64 on macOS is not supported."
-        " If you are on an Apple silicon system, check the build"
-        " documentation for possible fixes: "
-        "https://ml-explore.github.io/mlx/build/html/install.html#build-from-source")
+      message(
+        FATAL_ERROR
+          "Building for x86_64 on macOS is not supported."
+          " If you are on an Apple silicon system, check the build"
+          " documentation for possible fixes: "
+          "https://ml-explore.github.io/mlx/build/html/install.html#build-from-source"
+      )
    else()
+      set(MLX_BUILD_METAL OFF)
      message(WARNING "Building for x86_64 arch is not officially supported.")
    endif()
-    set(MLX_BUILD_METAL OFF)
-  elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64")
-    set(MLX_BUILD_ARM ON)
  endif()

 else()
+  set(MLX_BUILD_METAL OFF)
  message(WARNING "MLX is prioritised for Apple silicon systems using macOS.")
 endif()

@@ -61,63 +63,59 @@ cmake_policy(SET CMP0135 NEW)

 add_library(mlx)

-if (MLX_BUILD_METAL)
-  find_library(METAL_LIB Metal)
-  find_library(FOUNDATION_LIB Foundation)
-  find_library(QUARTZ_LIB QuartzCore)
+if(MLX_BUILD_METAL)
+  set(METAL_LIB "-framework Metal")
+  set(FOUNDATION_LIB "-framework Foundation")
+  set(QUARTZ_LIB "-framework QuartzCore")
 endif()

-if (MLX_BUILD_METAL AND NOT METAL_LIB)
+if(MLX_BUILD_METAL AND NOT METAL_LIB)
  message(STATUS "Metal not found. Unable to build GPU")
  set(MLX_BUILD_METAL OFF)
  set(MLX_METAL_DEBUG OFF)
-elseif (MLX_BUILD_METAL)
+elseif(MLX_BUILD_METAL)
  message(STATUS "Building METAL sources")

-  if (MLX_METAL_DEBUG)
+  if(MLX_METAL_DEBUG)
    add_compile_definitions(MLX_METAL_DEBUG)
  endif()

  # Throw an error if xcrun not found
-  execute_process(COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
-                  OUTPUT_VARIABLE MACOS_VERSION
-                  COMMAND_ERROR_IS_FATAL ANY)
-
-  if (${MACOS_VERSION} LESS 14.0)
-    message(FATAL_ERROR "MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON" )
-  endif()
-  message(STATUS "Building with SDK for macOS version ${MACOS_VERSION}")
-
-  set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18-beta.zip)
-  # Get the metal version
  execute_process(
-    COMMAND zsh "-c" "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal -E -x metal -P - | tail -1 | tr -d '\n'"
-    OUTPUT_VARIABLE MLX_METAL_VERSION
-    COMMAND_ERROR_IS_FATAL ANY)
+    COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
+    OUTPUT_VARIABLE MACOS_SDK_VERSION COMMAND_ERROR_IS_FATAL ANY)

-  FetchContent_Declare(
-    metal_cpp
-    URL ${METAL_CPP_URL}
+  if(${MACOS_SDK_VERSION} LESS 14.0)
+    message(
+      FATAL_ERROR
+        "MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON")
+  endif()
+  message(STATUS "Building with macOS SDK version ${MACOS_SDK_VERSION}")
+
+  set(METAL_CPP_URL
+      https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18-beta.zip
  )

+  if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
+    set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
+  endif()
+  execute_process(
+    COMMAND
+      zsh "-c"
+      "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
+    OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
+  FetchContent_Declare(metal_cpp URL ${METAL_CPP_URL})
+
  FetchContent_MakeAvailable(metal_cpp)
  target_include_directories(
-    mlx PUBLIC
-    $<BUILD_INTERFACE:${metal_cpp_SOURCE_DIR}>
-    $<INSTALL_INTERFACE:include/metal_cpp>
-  )
-  target_link_libraries(
-    mlx PUBLIC
-    ${METAL_LIB}
-    ${FOUNDATION_LIB}
-    ${QUARTZ_LIB})
-
-  add_compile_definitions("MLX_METAL_VERSION=${MLX_METAL_VERSION}")
+    mlx PUBLIC $<BUILD_INTERFACE:${metal_cpp_SOURCE_DIR}>
+               $<INSTALL_INTERFACE:include/metal_cpp>)
+  target_link_libraries(mlx PUBLIC ${METAL_LIB} ${FOUNDATION_LIB} ${QUARTZ_LIB})
 endif()

-if (MLX_BUILD_CPU)
+if(MLX_BUILD_CPU)
  find_library(ACCELERATE_LIBRARY Accelerate)
-  if (MLX_BUILD_ARM AND ACCELERATE_LIBRARY)
+  if(ACCELERATE_LIBRARY)
    message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
    set(MLX_BUILD_ACCELERATE ON)
    target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
@@ -129,139 +127,135 @@ if (MLX_BUILD_CPU)
      # The blas shipped in macOS SDK is not supported, search homebrew for
      # openblas instead.
      set(BLA_VENDOR OpenBLAS)
-      set(LAPACK_ROOT "${LAPACK_ROOT};$ENV{LAPACK_ROOT};/usr/local/opt/openblas")
+      set(LAPACK_ROOT
+          "${LAPACK_ROOT};$ENV{LAPACK_ROOT};/usr/local/opt/openblas")
    endif()
    # Search and link with lapack.
    find_package(LAPACK REQUIRED)
-    if (NOT LAPACK_FOUND)
+    if(NOT LAPACK_FOUND)
      message(FATAL_ERROR "Must have LAPACK installed")
    endif()
-    find_path(LAPACK_INCLUDE_DIRS lapacke.h
-      /usr/include
-      /usr/local/include
-      /usr/local/opt/openblas/include)
+    find_path(LAPACK_INCLUDE_DIRS lapacke.h /usr/include /usr/local/include
+              /usr/local/opt/openblas/include)
    message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
    message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
    target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
    target_link_libraries(mlx PUBLIC ${LAPACK_LIBRARIES})
-    # List blas after lapack otherwise we may accidentally incldue an old version
-    # of lapack.h from the include dirs of blas.
+    # List blas after lapack otherwise we may accidentally include an old
+    # version of lapack.h from the include dirs of blas.
    find_package(BLAS REQUIRED)
-    if (NOT BLAS_FOUND)
+    if(NOT BLAS_FOUND)
      message(FATAL_ERROR "Must have BLAS installed")
    endif()
    # TODO find a cleaner way to do this
-    find_path(BLAS_INCLUDE_DIRS cblas.h
-      /usr/include
-      /usr/local/include
-      $ENV{BLAS_HOME}/include)
+    find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include /usr/local/include
+              $ENV{BLAS_HOME}/include)
    message(STATUS "Blas lib " ${BLAS_LIBRARIES})
    message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
    target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
    target_link_libraries(mlx PUBLIC ${BLAS_LIBRARIES})
+
+    if(WIN32)
+      find_package(dlfcn-win32 REQUIRED)
+      message(STATUS "dlfcn-win32 lib " ${dlfcn-win32_LIBRARIES})
+      message(STATUS "dlfcn-win32 include " ${dlfcn-win32_INCLUDE_DIRS})
+      target_link_libraries(mlx PUBLIC ${dlfcn-win32_LIBRARIES})
+    endif()
  endif()
 else()
  set(MLX_BUILD_ACCELERATE OFF)
 endif()

 find_package(MPI)
-if (MPI_FOUND)
+if(MPI_FOUND)
  execute_process(
    COMMAND zsh "-c" "mpirun --version"
    OUTPUT_VARIABLE MPI_VERSION
-    ERROR_QUIET
-  )
-  if (${MPI_VERSION} MATCHES ".*Open MPI.*")
+    ERROR_QUIET)
+  if(${MPI_VERSION} MATCHES ".*Open MPI.*")
    target_include_directories(mlx PRIVATE ${MPI_INCLUDE_PATH})
-  elseif (MPI_VERSION STREQUAL "")
+  elseif(MPI_VERSION STREQUAL "")
    set(MPI_FOUND FALSE)
    message(
-      WARNING
-      "MPI found but mpirun is not available. Building without MPI."
-    )
+      WARNING "MPI found but mpirun is not available. Building without MPI.")
  else()
    set(MPI_FOUND FALSE)
-    message(
-      WARNING
-      "MPI which is not OpenMPI found. Building without MPI."
-    )
-  endif() 
+    message(WARNING "MPI which is not OpenMPI found. Building without MPI.")
+  endif()
 endif()

 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)

 target_include_directories(
-  mlx
-  PUBLIC
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
-  $<INSTALL_INTERFACE:include>
-)
+  mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
+             $<INSTALL_INTERFACE:include>)

-FetchContent_Declare(fmt
+FetchContent_Declare(
+  fmt
  GIT_REPOSITORY https://github.com/fmtlib/fmt.git
-  GIT_TAG 10.2.1 
-  EXCLUDE_FROM_ALL
-)
+  GIT_TAG 10.2.1
+  EXCLUDE_FROM_ALL)
 FetchContent_MakeAvailable(fmt)
-target_link_libraries(mlx PRIVATE fmt::fmt-header-only)
+target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:fmt::fmt-header-only>)

-if (MLX_BUILD_PYTHON_BINDINGS)
+if(MLX_BUILD_PYTHON_BINDINGS)
  message(STATUS "Building Python bindings.")
-  find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
+  find_package(
+    Python 3.8
+    COMPONENTS Interpreter Development.Module
+    REQUIRED)
  execute_process(
    COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
-    OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR)
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE NB_DIR)
  list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
  find_package(nanobind CONFIG REQUIRED)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/python/src)
 endif()

-if (MLX_BUILD_TESTS)
+if(MLX_BUILD_TESTS)
  include(CTest)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tests)
 endif()

-if (MLX_BUILD_EXAMPLES)
+if(MLX_BUILD_EXAMPLES)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/examples/cpp)
 endif()

-if (MLX_BUILD_BENCHMARKS)
+if(MLX_BUILD_BENCHMARKS)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/benchmarks/cpp)
 endif()

-
-
 # ----------------------------- Installation -----------------------------
 include(GNUInstallDirs)

 # Install library
 install(
-    TARGETS mlx
-    EXPORT MLXTargets
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-)
-
+  TARGETS mlx
+  EXPORT MLXTargets
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  INCLUDES
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

 # Install headers
 install(
-    DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/mlx
-    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-    COMPONENT headers
-    FILES_MATCHING PATTERN "*.h"
-)
+  DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/mlx
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  COMPONENT headers
+  FILES_MATCHING
+  PATTERN "*.h"
+  PATTERN "backend/metal/kernels.h" EXCLUDE)

 # Install metal dependencies
-if (MLX_BUILD_METAL)
+if(MLX_BUILD_METAL)

  # Install metal cpp
  install(
-      DIRECTORY ${metal_cpp_SOURCE_DIR}/
-      DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/metal_cpp
-      COMPONENT metal_cpp_source
-  )
+    DIRECTORY ${metal_cpp_SOURCE_DIR}/
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/metal_cpp
+    COMPONENT metal_cpp_source)

 endif()

@@ -273,31 +267,24 @@ set(MLX_CMAKE_INSTALL_MODULE_DIR share/cmake/MLX)
 install(
  EXPORT MLXTargets
  FILE MLXTargets.cmake
-  DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
-)
+  DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})

 include(CMakePackageConfigHelpers)

 write_basic_package_version_file(
  ${MLX_CMAKE_BUILD_VERSION_CONFIG}
  COMPATIBILITY SameMajorVersion
-  VERSION ${MLX_VERSION}
-)
+  VERSION ${MLX_VERSION})

 configure_package_config_file(
-  ${CMAKE_CURRENT_LIST_DIR}/mlx.pc.in
-  ${MLX_CMAKE_BUILD_CONFIG}
+  ${CMAKE_CURRENT_LIST_DIR}/mlx.pc.in ${MLX_CMAKE_BUILD_CONFIG}
  INSTALL_DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
  NO_CHECK_REQUIRED_COMPONENTS_MACRO
-  PATH_VARS CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_INCLUDEDIR MLX_CMAKE_INSTALL_MODULE_DIR
-)
+  PATH_VARS CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_INCLUDEDIR
+            MLX_CMAKE_INSTALL_MODULE_DIR)

-install(
-  FILES ${MLX_CMAKE_BUILD_CONFIG} ${MLX_CMAKE_BUILD_VERSION_CONFIG}
-  DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
-)
+install(FILES ${MLX_CMAKE_BUILD_CONFIG} ${MLX_CMAKE_BUILD_VERSION_CONFIG}
+        DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})

-install(
-  DIRECTORY ${CMAKE_MODULE_PATH}/
-  DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
-)
+install(DIRECTORY ${CMAKE_MODULE_PATH}/
+        DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@

 [![CircleCI](https://circleci.com/gh/ml-explore/mlx.svg?style=svg)](https://circleci.com/gh/ml-explore/mlx)

-MLX is an array framework for machine learning research on Apple silicon,
+MLX is an array framework for machine learning on Apple silicon,
 brought to you by Apple machine learning research.

 Some key features of MLX include:
--- a/benchmarks/python/comparative/bench_mlx.py
+++ b/benchmarks/python/comparative/bench_mlx.py
@@ -144,6 +144,13 @@ def reduction(op, axis, x):
    mx.eval(ys)


+def sum_and_add(axis, x, y):  # Benchmark: keepdims sums interleaved with adds.
+    z = x.sum(axis=axis, keepdims=True)
+    for i in range(50):  # each step adds y, then reduces over `axis` again
+        z = (z + y).sum(axis=axis, keepdims=True)
+    mx.eval(z)  # mx.eval is called once, after the whole chain is built
+
+
 def softmax(axis, x):
    ys = []
    for i in range(100):
@@ -505,5 +512,8 @@ if __name__ == "__main__":
    elif args.benchmark == "selu":
        print(bench(selu, x))

+    elif args.benchmark == "sum_and_add":
+        print(bench(sum_and_add, axis, *xs))
+
    else:
        raise ValueError("Unknown benchmark")
--- a/benchmarks/python/conv2d_bench_cpu.py
+++ b/benchmarks/python/conv2d_bench_cpu.py
@@ -0,0 +1,127 @@
+import argparse
+import math
+import time
+
+import mlx.core as mx
+import numpy as np
+import torch
+
+N_warmup = 1
+N_iter_bench = 10
+N_iter_func = 5
+mx.set_default_device(mx.cpu)
+
+
+def bench(f, a, b):
+    """Return the seconds taken by N_iter_bench calls of f(a, b) after warmup."""
+    for _ in range(N_warmup):
+        f(a, b)
+    tic = time.perf_counter_ns()
+    for _ in range(N_iter_bench):
+        f(a, b)
+    toc = time.perf_counter_ns()
+    return (toc - tic) * 1e-9
+
+
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    """Return a closure that runs and evaluates N_iter_func MLX conv2d calls."""
+    kwargs = dict(stride=strides, padding=padding, groups=groups)
+
+    def mx_conv_2D(a, b):
+        outs = [mx.conv2d(a, b, **kwargs) for _ in range(N_iter_func)]
+        mx.eval(outs)
+        return outs
+
+    return mx_conv_2D
+
+
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    """Return a no-grad closure that runs N_iter_func PyTorch conv2d calls."""
+    kwargs = dict(stride=strides, padding=padding, groups=groups)
+
+    @torch.no_grad()
+    def pt_conv_2D(a, b):
+        # One output per iteration, returned as a list.
+        return [torch.conv2d(a, b, **kwargs) for _ in range(N_iter_func)]
+
+    return pt_conv_2D
+
+
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    """Time MLX vs. PyTorch conv2d on one shape; returns (time_mlx, time_torch)."""
+    # Scale the weights by the full kernel extent kH * kW * C.
+    scale = 1.0 / math.sqrt(kH * kW * C)
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    w_shape = (O, kH, kW, int(C / groups))
+    b_np = np.random.uniform(-scale, scale, w_shape).astype(np_dtype)
+
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+
+    # Move the channel axis for PyTorch: (N, H, W, C) -> (N, C, H, W).
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("cpu")
+    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("cpu")
+
+    f_mx = make_mx_conv_2D(strides, padding, groups)
+    f_pt = make_pt_conv_2D(strides, padding, groups)
+
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+
+    # Compare one untimed output from each framework in the MLX axis order.
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv2d(a_pt, b_pt, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1)).numpy(force=True)
+
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+    # A mismatch is only reported; the timings are still returned.
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+
+    return time_mlx, time_torch
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")
+
+    dtypes = ("float32",)
+    shapes = (
+        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
+        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        # (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
+        # (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
+        # (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
+        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
+    )
+
+    for dtype in dtypes:
+        print(
+            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
+        )
+        for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
+            np_dtype = getattr(np, dtype)
+            time_mlx, time_torch = bench_shape(
+                N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            diff = time_torch / time_mlx - 1.0
+
+            print(
+                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            if time_mlx >= 2.0 * time_torch:
+                print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/conv2d_train_bench_cpu.py
+++ b/benchmarks/python/conv2d_train_bench_cpu.py
@@ -0,0 +1,143 @@
+import time
+
+import mlx.core as mx
+import mlx.nn
+import mlx.optimizers as opt
+import torch
+
+
+def bench_mlx(steps: int = 20) -> float:
+    mx.set_default_device(mx.cpu)
+
+    class BenchNetMLX(mlx.nn.Module):
+        # simple encoder-decoder net
+
+        def __init__(self, in_channels, hidden_channels=32):
+            super().__init__()
+
+            self.net = mlx.nn.Sequential(
+                mlx.nn.Conv2d(in_channels, hidden_channels, kernel_size=3, padding=1),
+                mlx.nn.ReLU(),
+                mlx.nn.Conv2d(
+                    hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
+                ),
+                mlx.nn.ReLU(),
+                mlx.nn.ConvTranspose2d(
+                    2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
+                ),
+                mlx.nn.ReLU(),
+                mlx.nn.ConvTranspose2d(
+                    hidden_channels, in_channels, kernel_size=3, padding=1
+                ),
+            )
+
+        def __call__(self, input):
+            return self.net(input)
+
+    benchNet = BenchNetMLX(3)
+    mx.eval(benchNet.parameters())
+    optim = opt.Adam(learning_rate=1e-3)
+
+    inputs = mx.random.normal([10, 256, 256, 3])
+
+    params = benchNet.parameters()
+    optim.init(params)
+
+    state = [benchNet.state, optim.state]
+
+    def loss_fn(params, image):
+        benchNet.update(params)
+        pred_image = benchNet(image)
+        return (pred_image - image).abs().mean()
+
+    def step(params, image):
+        loss, grads = mx.value_and_grad(loss_fn)(params, image)
+        optim.update(benchNet, grads)
+        return loss
+
+    total_time = 0.0
+    print("MLX:")
+    for i in range(steps):
+        start_time = time.perf_counter()
+
+        step(benchNet.parameters(), inputs)
+        mx.eval(state)
+        end_time = time.perf_counter()
+
+        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
+        total_time += (end_time - start_time) * 1000
+
+    return total_time
+
+
+def bench_torch(steps: int = 20) -> float:
+    device = torch.device("cpu")
+
+    class BenchNetTorch(torch.nn.Module):
+        # simple encoder-decoder net
+
+        def __init__(self, in_channels, hidden_channels=32):
+            super().__init__()
+
+            self.net = torch.nn.Sequential(
+                torch.nn.Conv2d(in_channels, hidden_channels, kernel_size=3, padding=1),
+                torch.nn.ReLU(),
+                torch.nn.Conv2d(
+                    hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
+                ),
+                torch.nn.ReLU(),
+                torch.nn.ConvTranspose2d(
+                    2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
+                ),
+                torch.nn.ReLU(),
+                torch.nn.ConvTranspose2d(
+                    hidden_channels, in_channels, kernel_size=3, padding=1
+                ),
+            )
+
+        def forward(self, input):
+            return self.net(input)
+
+    benchNet = BenchNetTorch(3).to(device)
+    optim = torch.optim.Adam(benchNet.parameters(), lr=1e-3)
+
+    inputs = torch.randn(10, 3, 256, 256, device=device)
+
+    def loss_fn(pred_image, image):
+        return (pred_image - image).abs().mean()
+
+    total_time = 0.0
+    print("PyTorch:")
+    for i in range(steps):
+        start_time = time.perf_counter()
+
+        optim.zero_grad()
+        pred_image = benchNet(inputs)
+        loss = loss_fn(pred_image, inputs)
+        loss.backward()
+        optim.step()
+
+        end_time = time.perf_counter()
+
+        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
+        total_time += (end_time - start_time) * 1000
+
+    return total_time
+
+
+def main():  # Run the MLX and PyTorch training benchmarks and print a timing summary.
+    steps = 20
+    time_mlx = bench_mlx(steps)
+    time_torch = bench_torch(steps)
+
+    print(f"average time of MLX:     {time_mlx/steps:9.2f} ms")
+    print(f"total time of MLX:       {time_mlx:9.2f} ms")
+    print(f"average time of PyTorch: {time_torch/steps:9.2f} ms")
+    print(f"total time of PyTorch:   {time_torch:9.2f} ms")
+
+    diff = time_torch / time_mlx - 1.0  # > 0 when PyTorch took longer than MLX
+    print(f"torch/mlx diff: {100. * diff:+5.2f}%")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/python/conv2d_transpose_bench_cpu.py
+++ b/benchmarks/python/conv2d_transpose_bench_cpu.py
@@ -0,0 +1,129 @@
+import argparse
+import math
+import time
+
+import mlx.core as mx
+import numpy as np
+import torch
+
+N_warmup = 1
+N_iter_bench = 10
+N_iter_func = 5
+
+
+def bench(f, a, b):
+    """Return the seconds taken by N_iter_bench calls of f(a, b) after warmup."""
+    for _ in range(N_warmup):  # untimed warmup calls
+        f(a, b)
+    tic = time.perf_counter_ns()
+    for _ in range(N_iter_bench):
+        f(a, b)
+    elapsed_ns = time.perf_counter_ns() - tic
+    return elapsed_ns * 1e-9
+
+
+def make_mx_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    """Build f(a, b) that runs N_iter_func MLX transposed 2D convs on mx.cpu."""
+
+    def mx_conv_transpose_2D(a, b):
+        kwargs = dict(stride=strides, padding=padding, groups=groups, stream=mx.cpu)
+        outputs = [mx.conv_transpose2d(a, b, **kwargs) for _ in range(N_iter_func)]
+        # MLX builds results lazily; mx.eval forces all of them to be computed
+        # before the function returns.
+        mx.eval(outputs)
+        return outputs
+
+    return mx_conv_transpose_2D
+
+
+def make_pt_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    """Build f(a, b) that runs N_iter_func PyTorch transposed 2D convs."""
+
+    @torch.no_grad()  # gradients are not tracked for these calls
+    def pt_conv_transpose_2D(a, b):
+        kwargs = dict(stride=strides, padding=padding, groups=groups)
+        outputs = []
+        for _ in range(N_iter_func):
+            outputs.append(torch.conv_transpose2d(a, b, **kwargs))
+        return outputs
+
+    return pt_conv_transpose_2D
+
+
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    """Time MLX vs. PyTorch CPU conv_transpose2d; return (time_mlx, time_torch)."""
+    scale = 1.0 / math.sqrt(kH * kW * C)  # fan-in spans the full kH x kW window
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    w_shape = (O // groups, kH, kW, C)
+    b_np = np.random.uniform(-scale, scale, w_shape).astype(np_dtype)
+
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+
+    # PyTorch copies: the last axis of a moves to axis 1, of b to axis 0.
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("cpu")
+    b_pt = torch.from_numpy(b_np.transpose((3, 0, 1, 2))).to("cpu")
+
+    f_mx = make_mx_conv_transpose_2D(strides, padding, groups)
+    f_pt = make_pt_conv_transpose_2D(strides, padding, groups)
+
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+
+    out_mx = mx.conv_transpose2d(
+        a_mx, b_mx, stride=strides, padding=padding, groups=groups, stream=mx.cpu
+    )
+    out_pt = torch.conv_transpose2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1)).numpy(force=True)
+
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+
+    if not np.allclose(out_pt, out_mx, atol=atol):  # report the mismatch, keep going
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+
+    return time_mlx, time_torch
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")  # unused: parse_args() is never called
+
+    dtypes = ("float32",)
+    shapes = (  # each row: (N, H, W, C, kH, kW, O, strides, padding, groups)
+        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
+        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
+    )
+
+    for dtype in dtypes:
+        print(
+            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
+        )
+        for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
+            np_dtype = getattr(np, dtype)  # e.g. "float32" -> np.float32
+            time_mlx, time_torch = bench_shape(
+                N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            diff = time_torch / time_mlx - 1.0  # > 0 when PyTorch took longer than MLX
+
+            print(
+                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            if time_mlx >= 2.0 * time_torch:  # flag rows where MLX took at least 2x as long
+                print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/conv3d_bench_cpu.py
+++ b/benchmarks/python/conv3d_bench_cpu.py
@@ -0,0 +1,110 @@
+import argparse
+import math
+import time
+
+import mlx.core as mx
+import numpy as np
+import torch
+
+N_warmup = 1
+N_iter_bench = 10
+N_iter_func = 5
+mx.set_default_device(mx.cpu)
+
+
+def bench(f, a, b):
+    """Return the seconds taken by N_iter_bench calls of f(a, b) after warmup."""
+    for _ in range(N_warmup):  # untimed warmup calls
+        f(a, b)
+    tic = time.perf_counter_ns()
+    for _ in range(N_iter_bench):
+        f(a, b)
+    elapsed_ns = time.perf_counter_ns() - tic
+    return elapsed_ns * 1e-9
+
+
+def make_mx_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
+    """Build f(a, b) running N_iter_func MLX conv3d calls (3-axis defaults)."""
+
+    def mx_conv_3D(a, b):
+        kwargs = dict(stride=strides, padding=padding, groups=groups)
+        ys = [mx.conv3d(a, b, **kwargs) for _ in range(N_iter_func)]
+        mx.eval(ys)  # MLX is lazy: force the convolutions to actually run
+        return ys
+
+    return mx_conv_3D
+
+
+def make_pt_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
+    """Build f(a, b) running N_iter_func torch.conv3d calls (3-axis defaults)."""
+
+    @torch.no_grad()  # gradients are not tracked for these calls
+    def pt_conv_3D(a, b):
+        # stride/padding default to one entry per spatial axis (D, H, W)
+        kwargs = dict(stride=strides, padding=padding, groups=groups)
+        return [torch.conv3d(a, b, **kwargs) for _ in range(N_iter_func)]
+
+    return pt_conv_3D
+
+
+def bench_shape(N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype):
+    """Benchmark a single 3D convolution configuration on the CPU.
+
+    Returns (time_mlx, time_torch) in seconds and prints a "Failed at ..."
+    line when the MLX and PyTorch outputs differ by more than atol.
+    """
+    scale = 1.0 / math.sqrt(kD * kH * kW * C)
+    a_np = np.random.uniform(0, 0.5, (N, D, H, W, C)).astype(np_dtype)
+    w_shape = (O, kD, kH, kW, int(C / groups))
+    b_np = np.random.uniform(-scale, scale, w_shape).astype(np_dtype)
+
+    a_mx, b_mx = mx.array(a_np), mx.array(b_np)
+    # PyTorch copies: the last axis of both a and b moves to axis 1.
+    a_pt = torch.from_numpy(a_np.transpose((0, 4, 1, 2, 3))).to("cpu")
+    b_pt = torch.from_numpy(b_np.transpose((0, 4, 1, 2, 3))).to("cpu")
+
+    time_torch = bench(make_pt_conv_3D(strides, padding, groups), a_pt, b_pt)
+    time_mlx = bench(make_mx_conv_3D(strides, padding, groups), a_mx, b_mx)
+
+    # Accuracy check: compare one MLX result with PyTorch, channels last.
+    out_mx = mx.conv3d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv3d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1)).numpy(force=True)
+
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, D, H, W, C)}, {(O, kD, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+
+    return time_mlx, time_torch
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")  # unused: parse_args() is never called
+
+    dtypes = ("float32",)
+    shapes = (  # each row: (N, D, H, W, C, kD, kH, kW, O, strides, padding, groups)
+        (4, 16, 16, 16, 16, 5, 5, 5, 16, (1, 1, 1), (2, 2, 2), 1),
+        (4, 16, 16, 16, 32, 5, 5, 5, 32, (1, 1, 1), (2, 2, 2), 1),
+    )
+
+    for dtype in dtypes:
+        print(
+            "(N,   D,   H,   W,   C), (  O, kD, kH, kW,   C),   dtype,    stride,      pads,  groups, diff%"
+        )
+        for N, D, H, W, C, kD, kH, kW, O, strides, padding, groups in shapes:
+            np_dtype = getattr(np, dtype)  # e.g. "float32" -> np.float32
+            time_mlx, time_torch = bench_shape(
+                N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            diff = time_torch / time_mlx - 1.0  # > 0 when PyTorch took longer than MLX
+
+            print(
+                f"({N}, {D:3d}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kD:2d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            if time_mlx >= 2.0 * time_torch:  # flag rows where MLX took at least 2x as long
+                print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/conv3d_train_bench_cpu.py
+++ b/benchmarks/python/conv3d_train_bench_cpu.py
@@ -0,0 +1,143 @@
+import time
+
+import mlx.core as mx
+import mlx.nn
+import mlx.optimizers as opt
+import torch
+
+
+def bench_mlx(steps: int = 20, shape=(10, 32, 32, 32, 3)) -> float:  # time `steps` MLX training steps; returns total ms
+    mx.set_default_device(mx.cpu)  # make the CPU the default device for what follows
+
+    class BenchNetMLX(mlx.nn.Module):
+        # simple encoder-decoder net
+
+        def __init__(self, in_channels, hidden_channels=16):
+            super().__init__()
+
+            self.net = mlx.nn.Sequential(
+                mlx.nn.Conv3d(in_channels, hidden_channels, kernel_size=3, padding=1),
+                mlx.nn.ReLU(),
+                mlx.nn.Conv3d(
+                    hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
+                ),
+                mlx.nn.ReLU(),
+                mlx.nn.ConvTranspose3d(
+                    2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
+                ),
+                mlx.nn.ReLU(),
+                mlx.nn.ConvTranspose3d(
+                    hidden_channels, in_channels, kernel_size=3, padding=1
+                ),
+            )
+
+        def __call__(self, input):
+            return self.net(input)
+
+    benchNet = BenchNetMLX(3)
+    mx.eval(benchNet.parameters())  # evaluate the initial parameters before any timing
+    optim = opt.Adam(learning_rate=1e-3)
+
+    inputs = mx.random.normal(shape)  # one fixed random batch, reused for every step
+
+    params = benchNet.parameters()
+    optim.init(params)
+
+    state = [benchNet.state, optim.state]  # what mx.eval() is asked to evaluate each step
+
+    def loss_fn(params, image):
+        benchNet.update(params)
+        pred_image = benchNet(image)
+        return (pred_image - image).abs().mean()  # mean absolute error against the input
+
+    def step(params, image):
+        loss, grads = mx.value_and_grad(loss_fn)(params, image)
+        optim.update(benchNet, grads)
+        return loss
+
+    total_time = 0.0  # accumulated step time in milliseconds
+    print("MLX:")
+    for i in range(steps):
+        start_time = time.perf_counter()
+
+        step(benchNet.parameters(), inputs)
+        mx.eval(state)  # inside the timed region, before end_time is taken
+        end_time = time.perf_counter()
+
+        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
+        total_time += (end_time - start_time) * 1000  # seconds -> milliseconds
+
+    return total_time
+
+
+def bench_torch(steps: int = 20, shape=(10, 3, 32, 32, 32)) -> float:
+    """Train a small Conv3d encoder-decoder with PyTorch on the CPU.
+
+    Runs `steps` Adam updates on one random batch of the given shape, prints
+    the wall-clock time of each step and returns the summed time in ms.
+    """
+    device = torch.device("cpu")
+
+    class BenchNetTorch(torch.nn.Module):
+        # two Conv3d layers followed by two ConvTranspose3d layers
+
+        def __init__(self, in_channels, hidden_channels=16):
+            super().__init__()
+            wide = 2 * hidden_channels
+            conv_args = dict(kernel_size=3, padding=1)
+
+            self.net = torch.nn.Sequential(
+                torch.nn.Conv3d(in_channels, hidden_channels, **conv_args),
+                torch.nn.ReLU(),
+                torch.nn.Conv3d(hidden_channels, wide, **conv_args),
+                torch.nn.ReLU(),
+                torch.nn.ConvTranspose3d(wide, hidden_channels, **conv_args),
+                torch.nn.ReLU(),
+                torch.nn.ConvTranspose3d(hidden_channels, in_channels, **conv_args),
+            )
+
+        def forward(self, input):
+            return self.net(input)
+
+    model = BenchNetTorch(3).to(device)
+    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+    images = torch.randn(*shape, device=device)
+
+    total_time = 0.0
+    print("PyTorch:")
+    # Each iteration times one full update: zero_grad, forward, backward, step.
+    for step_idx in range(steps):
+        start_time = time.perf_counter()
+
+        optimizer.zero_grad()
+        # Mean absolute error between the network output and its own input.
+        loss = (model(images) - images).abs().mean()
+        loss.backward()
+        optimizer.step()
+
+        step_ms = (time.perf_counter() - start_time) * 1000
+
+        # Report and accumulate the step time in milliseconds.
+        print(f"{step_idx:3d}, time={step_ms:7.2f} ms")
+        total_time += step_ms
+
+    return total_time
+
+
+def main():
+    steps = 10  # training iterations timed per framework
+    time_mlx, time_torch = bench_mlx(steps), bench_torch(steps)
+    # Both totals are in milliseconds; the averages below are per step.
+
+    print(f"average time of MLX:     {time_mlx/steps:9.2f} ms")
+    print(f"total time of MLX:       {time_mlx:9.2f} ms")
+    print(f"average time of PyTorch: {time_torch/steps:9.2f} ms")
+    print(f"total time of PyTorch:   {time_torch:9.2f} ms")
+
+    # Positive means PyTorch took longer than MLX, relative to the MLX total.
+    print(f"torch/mlx diff: {100. * (time_torch / time_mlx - 1.0):+5.2f}%")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/python/conv3d_transpose_bench_cpu.py
+++ b/benchmarks/python/conv3d_transpose_bench_cpu.py
@@ -0,0 +1,116 @@
+import argparse
+import math
+import time
+
+import mlx.core as mx
+import numpy as np
+import torch
+
+N_warmup = 1
+N_iter_bench = 10
+N_iter_func = 5
+mx.set_default_device(mx.cpu)
+
+
+def bench(f, a, b):
+    """Return the seconds taken by N_iter_bench calls of f(a, b) after warmup."""
+    for _ in range(N_warmup):  # untimed warmup calls
+        f(a, b)
+    tic = time.perf_counter_ns()
+    for _ in range(N_iter_bench):
+        f(a, b)
+    elapsed_ns = time.perf_counter_ns() - tic
+    return elapsed_ns * 1e-9
+
+
+def make_mx_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
+    """Build f(a, b) that runs N_iter_func MLX conv_transpose3d calls."""
+
+    def mx_conv_3D(a, b):
+        kwargs = dict(stride=strides, padding=padding, groups=groups)
+        outputs = [mx.conv_transpose3d(a, b, **kwargs) for _ in range(N_iter_func)]
+        # MLX builds results lazily; mx.eval forces all of them to be computed
+        # before the function returns.
+        mx.eval(outputs)
+        return outputs
+
+    return mx_conv_3D
+
+
+def make_pt_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
+    """Build f(a, b) that runs N_iter_func torch.conv_transpose3d calls."""
+
+    @torch.no_grad()  # gradients are not tracked for these calls
+    def pt_conv_3D(a, b):
+        kwargs = dict(stride=strides, padding=padding, groups=groups)
+        outputs = []
+        for _ in range(N_iter_func):
+            outputs.append(torch.conv_transpose3d(a, b, **kwargs))
+        return outputs
+
+    return pt_conv_3D
+
+
+def bench_shape(N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype):
+    """Benchmark one 3D transposed-convolution configuration on the CPU.
+
+    Returns (time_mlx, time_torch) in seconds and prints a "Failed at ..."
+    line when the MLX and PyTorch outputs differ by more than atol.
+    """
+    scale = 1.0 / math.sqrt(kD * kH * kW * C)
+    a_np = np.random.uniform(0, 0.5, (N, D, H, W, C)).astype(np_dtype)
+    # NOTE(review): the weight's last axis is C / groups here; only groups=1 is
+    # exercised by the shapes in this file - confirm the layout for groups > 1.
+    w_shape = (O, kD, kH, kW, int(C / groups))
+    b_np = np.random.uniform(-scale, scale, w_shape).astype(np_dtype)
+
+    a_mx, b_mx = mx.array(a_np), mx.array(b_np)
+    # PyTorch copies: a's last axis moves to axis 1, b's last axis to axis 0.
+    a_pt = torch.from_numpy(a_np.transpose((0, 4, 1, 2, 3))).to("cpu")
+    b_pt = torch.from_numpy(b_np.transpose((4, 0, 1, 2, 3))).to("cpu")
+
+    # Time PyTorch first, then MLX.
+    time_torch = bench(make_pt_conv_3D(strides, padding, groups), a_pt, b_pt)
+    time_mlx = bench(make_mx_conv_3D(strides, padding, groups), a_mx, b_mx)
+
+    # Accuracy check: compare one MLX result with PyTorch, channels last.
+    conv_kwargs = dict(stride=strides, padding=padding, groups=groups)
+    out_mx = mx.conv_transpose3d(a_mx, b_mx, **conv_kwargs)
+    out_pt = torch.conv_transpose3d(a_pt.to("cpu"), b_pt.to("cpu"), **conv_kwargs)
+    out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1)).numpy(force=True)
+
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, D, H, W, C)}, {(O, kD, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+
+    return time_mlx, time_torch
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")  # unused: parse_args() is never called
+
+    dtypes = ("float32",)
+    shapes = (  # each row: (N, D, H, W, C, kD, kH, kW, O, strides, padding, groups)
+        (4, 16, 16, 16, 16, 5, 5, 5, 16, (1, 1, 1), (2, 2, 2), 1),
+        (4, 16, 16, 16, 32, 5, 5, 5, 32, (1, 1, 1), (2, 2, 2), 1),
+    )
+
+    for dtype in dtypes:
+        print(
+            "(N,   D,   H,   W,   C), (  O, kD, kH, kW,   C),   dtype,    stride,      pads,  groups, diff%"
+        )
+        for N, D, H, W, C, kD, kH, kW, O, strides, padding, groups in shapes:
+            np_dtype = getattr(np, dtype)  # e.g. "float32" -> np.float32
+            time_mlx, time_torch = bench_shape(
+                N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            diff = time_torch / time_mlx - 1.0  # > 0 when PyTorch took longer than MLX
+
+            print(
+                f"({N}, {D:3d}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kD:2d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            if time_mlx >= 2.0 * time_torch:  # flag rows where MLX took at least 2x as long
+                print("ATTENTION ^^^^^^^")
--- a/benchmarks/python/scatter_bench.py
+++ b/benchmarks/python/scatter_bench.py
@@ -9,7 +9,7 @@ from time_utils import measure_runtime

 def benchmark_scatter_mlx(dst_shape, x_shape, idx_shapes):
    def scatter(dst, x, idx):
-        dst[*idx] = x
+        dst[tuple(idx)] = x
        mx.eval(dst)

    idx = []
@@ -23,8 +23,8 @@ def benchmark_scatter_mlx(dst_shape, x_shape, idx_shapes):


 def benchmark_scatter_torch(dst_shape, x_shape, idx_shapes, device):
-    def gather(dst, x, idx, device):
-        dst[*idx] = x
+    def scatter(dst, x, idx, device):
+        dst[tuple(idx)] = x
        if device == torch.device("mps"):
            torch.mps.synchronize()

@@ -34,7 +34,7 @@ def benchmark_scatter_torch(dst_shape, x_shape, idx_shapes, device):
    x = torch.randn(x_shape, dtype=torch.float32).to(device)
    dst = torch.randn(dst_shape, dtype=torch.float32).to(device)

-    runtime = measure_runtime(gather, dst=dst, x=x, idx=idx, device=device)
+    runtime = measure_runtime(scatter, dst=dst, x=x, idx=idx, device=device)
    print(f"PyTorch: {runtime:.3f}ms")


@@ -54,7 +54,7 @@ if __name__ == "__main__":
        (100_000, 64),
        (1_000_000, 64),
        (100_000,),
-        (2_000_00,),
+        (200_000,),
        (20_000_000,),
        (10000, 64),
        (100, 64),
@@ -91,6 +91,6 @@ if __name__ == "__main__":

    for dst_shape, x_shape, idx_shape in zip(dst_shapes, x_shapes, idx_shapes):
        print("=" * 20)
-        print(f"X {x_shape}, Indices {idx_shape}")
+        print(f"Dst: {dst_shape}, X {x_shape}, Indices {idx_shape}")
        benchmark_scatter_mlx(dst_shape, x_shape, idx_shape)
        benchmark_scatter_torch(dst_shape, x_shape, idx_shape, device=device)
--- a/benchmarks/python/sdpa_bench.py
+++ b/benchmarks/python/sdpa_bench.py
@@ -1,62 +1,189 @@
+# Copyright © 2024 Apple Inc.
+
 import argparse
 import math
+import os
+import subprocess
+import time

 import mlx.core as mx
-from time_utils import time_fn
+import numpy as np

-MAX_SEQ = 300
-START_SEQ = 100
-SEQ_INCREMENT = 50
+device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
+device_name = device_name.decode("utf-8").strip("\n")
+
+N_warmup = 5
+N_iter_bench = 40
+N_iter_func = 8


-def time_self_attention_primitives():
-    mx.random.seed(3)
-    B = 2
-    H = 38
-    D = 64
-    for R in range(START_SEQ, MAX_SEQ, SEQ_INCREMENT):
-        q = mx.random.uniform(shape=(B, H, R, D))
-        k = mx.random.uniform(shape=(B, H, R, D))
-        v = mx.random.uniform(shape=(B, H, R, D))
-        scale = 1.0 / math.sqrt(float(D))
-        mx.eval(q, k, v)
+def bench(f, *args):
+    for i in range(N_warmup):
+        f(*args)

-        def sdpa_primitives(qs, ks, vs, alpha):
-            s = (alpha * qs) @ ks.transpose(0, 1, 3, 2)
-            p = mx.softmax(s.astype(mx.float32), axis=-1).astype(s.dtype)
-            o = p @ vs
-            return o
-
-        time_fn(sdpa_primitives, q, k, v, scale)
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(*args)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9


-def time_self_attention_sdpa():
-    mx.random.seed(3)
-    B = 2
-    H = 38
-    D = 64
-    for R in range(START_SEQ, MAX_SEQ, SEQ_INCREMENT):
-        q = mx.random.uniform(shape=(B, H, R, D))
-        k = mx.random.uniform(shape=(B, H, R, D))
-        v = mx.random.uniform(shape=(B, H, R, D))
-        scale = 1.0 / math.sqrt(float(D))
-        mx.eval(q, k, v)
+def mlx_sdpa_fused_inner(q, k, v, scale):  # one fused attention call with no mask
+    return mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask=None)

-        def sdpa_fused(qs, ks, vs, alpha):
-            o = mx.fast.scaled_dot_product_attention(qs, ks, vs, scale=alpha)
-            return o

-        time_fn(sdpa_fused, q, k, v, scale)
+def mlx_sdpa_unfused_inner(q, k, v, scale, f32softmax=False):
+    """Reference attention built from matmul and softmax primitives.
+
+    When q has more heads than k/v, the query heads are grouped so that each
+    k/v head is shared by n_q_heads // n_kv_heads query heads.
+    """
+    out_dtype = q.dtype
+    q = q * mx.array(scale, out_dtype)
+    B, n_q_heads, L = q.shape[0], q.shape[-3], q.shape[2]
+    n_kv_heads = k.shape[-3]
+    grouped = n_q_heads // n_kv_heads > 1
+
+    if grouped:
+        q = mx.reshape(q, [B, n_kv_heads, n_q_heads // n_kv_heads, L, -1])
+        k, v = mx.expand_dims(k, 2), mx.expand_dims(v, 2)
+
+    logits = q @ mx.swapaxes(k, -1, -2)
+    if f32softmax:
+        # softmax in float32, then cast back to the query dtype
+        probs = mx.softmax(logits.astype(mx.float32), axis=-1).astype(out_dtype)
+    else:
+        probs = mx.softmax(logits, axis=-1)
+
+    out = probs @ v
+    # undo the head grouping so the result has n_q_heads heads again
+    return mx.reshape(out, [B, n_q_heads, L, -1]) if grouped else out
+
+
+def mlx_spda_unfused(q, k, v, scale, transpose):
+    """Chain N_iter_func unfused attention calls, feeding each output back as q."""
+    perm = (0, 2, 1, 3)  # swaps axes 1 and 2
+    if transpose:
+        k, v = mx.transpose(k, perm), mx.transpose(v, perm)
+
+    q_out = q
+    for _ in range(N_iter_func):
+        # With transpose set, q is swapped before the call and swapped back after.
+        attn_in = mx.transpose(q_out, perm) if transpose else q_out
+        attn_out = mlx_sdpa_unfused_inner(attn_in, k, v, scale)
+        q_out = mx.transpose(attn_out, perm) if transpose else attn_out
+
+    mx.eval(q_out)
+    return q_out
+
+
+def mlx_spda_fused(q, k, v, scale, transpose):
+    """Chain N_iter_func fused attention calls, feeding each output back as q."""
+    perm = (0, 2, 1, 3)  # swaps axes 1 and 2
+    if transpose:
+        k, v = mx.transpose(k, perm), mx.transpose(v, perm)
+
+    q_out = q
+    for _ in range(N_iter_func):
+        # With transpose set, q is swapped before the call and swapped back after.
+        attn_in = mx.transpose(q_out, perm) if transpose else q_out
+        attn_out = mlx_sdpa_fused_inner(attn_in, k, v, scale)
+        q_out = mx.transpose(attn_out, perm) if transpose else attn_out
+
+    mx.eval(q_out)
+    return q_out
+
+
+def bench_shape(B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, np_dtype, transpose=True):
+    """Time fused vs. unfused MLX attention; return (time_fused, time_unfused)."""
+    # With transpose set, inputs are laid out (B, seq, heads, dim); otherwise
+    # they are (B, heads, seq, dim).
+    if transpose:
+        shape_q = (B, qsl, n_q_heads, head_dim)
+        shape_kv = (B, ksl, n_kv_heads, head_dim)
+    else:
+        shape_q = (B, n_q_heads, qsl, head_dim)
+        shape_kv = (B, n_kv_heads, ksl, head_dim)
+
+    std = 1.0 / math.sqrt(head_dim)
+    q_np = np.random.normal(0.0, std, shape_q).astype(np_dtype)
+    k_np = np.random.normal(0.0, std, shape_kv).astype(np_dtype)
+    v_np = np.random.normal(0.0, std, shape_kv).astype(np_dtype)
+
+    scale = math.sqrt(1.0 / head_dim)
+    q_mx, k_mx, v_mx = (mx.array(x) for x in (q_np, k_np, v_np))
+
+    time_mlx_unfused = bench(mlx_spda_unfused, q_mx, k_mx, v_mx, scale, transpose)
+    time_mlx_fused = bench(mlx_spda_fused, q_mx, k_mx, v_mx, scale, transpose)
+
+    # The accuracy check below calls the inner kernels, which take heads first.
+    if transpose:
+        q_mx, k_mx, v_mx = (mx.transpose(x, (0, 2, 1, 3)) for x in (q_mx, k_mx, v_mx))
+
+    o_mlx_fused = mlx_sdpa_fused_inner(q_mx, k_mx, v_mx, scale)
+    o_mlx_unfused = mlx_sdpa_unfused_inner(q_mx, k_mx, v_mx, scale, f32softmax=True)
+
+    atol = 1e-5 if np_dtype == np.float32 else 1e-4
+
+    if not mx.allclose(o_mlx_fused, o_mlx_unfused, atol=atol):
+        print(
+            f"Failed at (B: {B}, qsl: {qsl}, ksl: {ksl}, head_dim: {head_dim}, n_qh: {n_q_heads}, n_kvh: {n_kv_heads}) [tpose = {transpose}] with max(|a - b|) = {mx.max(mx.abs(o_mlx_unfused - o_mlx_fused)):3.2e}"
+        )
+
+    return time_mlx_fused, time_mlx_unfused
+
+
+def get_gflop_count(B, M, N, K):  # 2*B*M*N*K x N_iter_bench x N_iter_func, divided by 1024**3 (not 1e9)
+    return float(2.0 * N_iter_bench * N_iter_func * B * M * N * K) / float(1024.0**3)


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser("MLX benchmarks.")
-    parser.add_argument("--gpu", action="store_true", help="Use the Metal back-end.")
-    args = parser.parse_args()
-    if args.gpu:
-        mx.set_default_device(mx.gpu)
-    else:
-        mx.set_default_device(mx.cpu)
+    parser = argparse.ArgumentParser(description="Run sdpa benchmarks")

-    time_self_attention_sdpa()
-    time_self_attention_primitives()
+    dtypes = ("float16", "float32")[:1]
+    transposes = (False,)
+
+    # fmt: off
+    shapes_64 = (
+        # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
+          (  1,    32,    32,       64,   32,    32),
+          (  1,    64,    64,       64,   32,    32),
+          (  1,   128,   128,       64,   32,    32),
+          (  1,   256,   256,       64,   32,    32),
+          (  1,   512,   512,       64,   32,    32),
+          (  1,  1024,  1024,       64,   32,    32),
+          (  1,  2048,  2048,       64,   32,    32),
+          (  1,  4096,  4096,       64,   32,    32),
+    )
+
+    shapes_80 = (
+        # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
+          (  1,  1024,  1024,       80,   32,    32),
+          (  1,  2048,  2048,       80,   32,    32),
+          (  1,  4096,  4096,       80,   32,    32),
+    )
+
+    shapes_128 = (
+        # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
+          (  1,  1024,  1024,      128,   32,    32),
+          (  1,  2048,  2048,      128,   32,    32),
+          (  1,  4096,  4096,      128,   32,    32),
+    )
+    # fmt: on
+
+    shapes = shapes_64 + shapes_80 + shapes_128
+
+    print("  B,   qsl,   ksl, hdim, n_qh, n_kvh, tpose,   dtype, t_unfs, t_fuse, diff%")
+
+    for dtype in dtypes:
+        for transpose in transposes:
+            for B, qsl, ksl, head_dim, n_q_heads, n_kv_heads in shapes:
+                np_dtype = getattr(np, dtype)
+                time_mlx_fused, time_mlx_unfused = bench_shape(
+                    B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, np_dtype, transpose
+                )
+                diff = time_mlx_unfused / time_mlx_fused - 1.0
+                t_str = 1 if transpose else 0
+                print(
+                    f"{B:3d}, {qsl:5d}, {ksl:5d}, {head_dim:4d}, {n_q_heads:4d}, {n_kv_heads:5d}, {t_str:5d}, {dtype}, {time_mlx_unfused: 2.3f}, {time_mlx_fused: 2.3f}, {100. * diff:+5.2f}%"
+                )
--- a/benchmarks/python/sdpa_vector_bench.py
+++ b/benchmarks/python/sdpa_vector_bench.py
@@ -0,0 +1,58 @@
+import argparse
+import math
+
+import mlx.core as mx
+from time_utils import time_fn
+
+L = 16384
+H = 32
+H_k = H // 4
+D = 128
+dtype = mx.float16
+loops = 10
+
+
+def attention(q, k, v):
+    """Apply unfused grouped-query attention `loops` times, feeding q forward."""
+
+    def _sdpa(q, k, v):
+        B, Hq, L, D = q.shape
+        _, Hk, _, _ = k.shape
+        q = q.reshape(B, Hk, Hq // Hk, L, D)
+        k, v = k[:, :, None, :, :], v[:, :, None, :, :]  # singleton group axis
+        scores = q @ k.transpose(0, 1, 2, 4, 3)
+        probs = mx.softmax(scores.astype(mx.float32), axis=-1).astype(scores.dtype)
+        return (probs @ v).reshape(B, Hq, L, D)
+
+    for _ in range(loops):
+        q = _sdpa(q, k, v)
+    return q
+
+
+def sdpa(q, k, v):
+    for _ in range(loops):  # chain `loops` fused calls, feeding each output back as q
+        q = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0)
+    return q
+
+
+def time_self_attention_primitives():
+    """Time `attention` with one query position against L key/value positions."""
+    mx.random.seed(3)
+    shapes = ((1, H, 1, D), (1, H_k, L, D), (1, H_k, L, D))
+    q, k, v = [mx.random.uniform(shape=s).astype(dtype) for s in shapes]
+    mx.eval(q, k, v)
+    time_fn(attention, q, k, v)
+
+
+def time_self_attention_sdpa():
+    """Time `sdpa` with one query position against L key/value positions."""
+    mx.random.seed(3)
+    shapes = ((1, H, 1, D), (1, H_k, L, D), (1, H_k, L, D))
+    q, k, v = [mx.random.uniform(shape=s).astype(dtype) for s in shapes]
+    mx.eval(q, k, v)
+    time_fn(sdpa, q, k, v)
+
+
+if __name__ == "__main__":
+    time_self_attention_sdpa()
+    time_self_attention_primitives()
--- a/cmake/extension.cmake
+++ b/cmake/extension.cmake
@@ -1,56 +1,41 @@
 include(CMakeParseArguments)

-###############################################################################
+# ##############################################################################
 # Build metal library
 #
 # Adds a custom target ${TARGET} to build ${OUTPUT_DIRECTORY}/{TITLE}.metallib
 # from list ${SOURCES}, including list ${INCLUDE_DIRS}, depends on list ${DEPS}
 #
-# Args:
-#     TARGET: Custom target to be added for the metal library 
-#     TITLE: Name of the .metallib
-#     OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib
-#     SOURCES: List of source files
-#     INCLUDE_DIRS: List of include dirs
-#     DEPS: List of dependency files (like headers)
+# Args: TARGET: custom target to be added for the metal library; TITLE: name of
+# the .metallib; OUTPUT_DIRECTORY: where to place ${TITLE}.metallib; SOURCES:
+# list of source files; INCLUDE_DIRS: list of include dirs; DEPS: list of
+# dependency files (like headers).
 #
 macro(mlx_build_metallib)
  # Parse args
  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY)
  set(multiValueArgs SOURCES INCLUDE_DIRS DEPS)
-  cmake_parse_arguments(
-      MTLLIB 
-      ""
-      "${oneValueArgs}"
-      "${multiValueArgs}" 
-      ${ARGN}
-  )
+  cmake_parse_arguments(MTLLIB "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  # Set output
  set(MTLLIB_BUILD_TARGET "${MTLLIB_OUTPUT_DIRECTORY}/${MTLLIB_TITLE}.metallib")

-  # Collect compile options 
+  # Collect compile options
  set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math)

  # Prepare metallib build command
  add_custom_command(
    OUTPUT ${MTLLIB_BUILD_TARGET}
-    COMMAND xcrun -sdk macosx metal 
-                  "$<LIST:TRANSFORM,${MTLLIB_INCLUDE_DIRS},PREPEND,-I>"
-                  ${MTLLIB_COMPILE_OPTIONS}
-                  ${MTLLIB_SOURCES}
-                  -o ${MTLLIB_BUILD_TARGET}
+    COMMAND
+      xcrun -sdk macosx metal
+      "$<LIST:TRANSFORM,${MTLLIB_INCLUDE_DIRS},PREPEND,-I>"
+      ${MTLLIB_COMPILE_OPTIONS} ${MTLLIB_SOURCES} -o ${MTLLIB_BUILD_TARGET}
    DEPENDS ${MTLLIB_DEPS} ${MTLLIB_SOURCES}
    COMMAND_EXPAND_LISTS
    COMMENT "Building ${MTLLIB_TITLE}.metallib"
-    VERBATIM
-  )
+    VERBATIM)

  # Add metallib custom target
-  add_custom_target(
-    ${MTLLIB_TARGET}
-    DEPENDS
-    ${MTLLIB_BUILD_TARGET}
-  )
+  add_custom_target(${MTLLIB_TARGET} DEPENDS ${MTLLIB_BUILD_TARGET})

-endmacro(mlx_build_metallib)
+endmacro(mlx_build_metallib)
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@@ -60,6 +60,7 @@ html_theme_options = {
    },
 }

+html_favicon = html_theme_options["logo"]["image_light"]

 # -- Options for HTMLHelp output ---------------------------------------------

--- a/docs/src/dev/custom_metal_kernels.rst
+++ b/docs/src/dev/custom_metal_kernels.rst
@@ -1,3 +1,5 @@
+.. _custom_metal_kernels:
+
 Custom Metal Kernels
 ====================

@@ -76,6 +78,10 @@ Putting this all together, the generated function signature for ``myexp`` is as

  template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;

+Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads <https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_ function.
+This means we will launch ``mx.prod(grid)`` threads, subdivided into threadgroups of size ``threadgroup``.
+For optimal performance, each threadgroup dimension should be less than or equal to the corresponding grid dimension.
+
 Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the generated code for debugging purposes.

 Using Shape/Strides
--- a/docs/src/dev/extensions.rst
+++ b/docs/src/dev/extensions.rst
@@ -420,8 +420,8 @@ element in the output.
            constant const float& alpha [[buffer(3)]],
            constant const float& beta [[buffer(4)]],
            constant const int* shape [[buffer(5)]],
-            constant const size_t* x_strides [[buffer(6)]],
-            constant const size_t* y_strides [[buffer(7)]],
+            constant const int64_t* x_strides [[buffer(6)]],
+            constant const int64_t* y_strides [[buffer(7)]],
            constant const int& ndim [[buffer(8)]],
            uint index [[thread_position_in_grid]]) {
        // Convert linear indices to offsets in array
@@ -438,24 +438,10 @@ each instantiation a unique host name so we can identify it.

 .. code-block:: C++

-    #define instantiate_axpby(type_name, type)              \
-        template [[host_name("axpby_general_" #type_name)]] \
-        [[kernel]] void axpby_general<type>(                \
-            device const type* x [[buffer(0)]],             \
-            device const type* y [[buffer(1)]],             \
-            device type* out [[buffer(2)]],                 \
-            constant const float& alpha [[buffer(3)]],      \
-            constant const float& beta [[buffer(4)]],       \
-            constant const int* shape [[buffer(5)]],        \
-            constant const size_t* x_strides [[buffer(6)]], \
-            constant const size_t* y_strides [[buffer(7)]], \
-            constant const int& ndim [[buffer(8)]],         \
-            uint index [[thread_position_in_grid]]);
-
-    instantiate_axpby(float32, float);
-    instantiate_axpby(float16, half);
-    instantiate_axpby(bfloat16, bfloat16_t);
-    instantiate_axpby(complex64, complex64_t);
+    instantiate_kernel("axpby_general_float32", axpby_general, float)
+    instantiate_kernel("axpby_general_float16", axpby_general, float16_t)
+    instantiate_kernel("axpby_general_bfloat16", axpby_general, bfloat16_t)
+    instantiate_kernel("axpby_general_complex64", axpby_general, complex64_t)

 The logic to determine the kernel, set the inputs, resolve the grid dimensions,
 and dispatch to the GPU are contained in :meth:`Axpby::eval_gpu` as shown
@@ -494,7 +480,7 @@ below.

        // Prepare to encode kernel
        auto& compute_encoder = d.get_command_encoder(s.index);
-        compute_encoder->setComputePipelineState(kernel);
+        compute_encoder.set_compute_pipeline_state(kernel);

        // Kernel parameters are registered with buffer indices corresponding to
        // those in the kernel declaration at axpby.metal
@@ -509,14 +495,14 @@ below.
        compute_encoder.set_output_array(out, 2);

        // Encode alpha and beta
-        compute_encoder->setBytes(&alpha_, sizeof(float), 3);
-        compute_encoder->setBytes(&beta_, sizeof(float), 4);
+        compute_encoder.set_bytes(alpha_, 3);
+        compute_encoder.set_bytes(beta_, 4);

        // Encode shape, strides and ndim
-        compute_encoder->setBytes(x.shape().data(), ndim * sizeof(int), 5);
-        compute_encoder->setBytes(x.strides().data(), ndim * sizeof(size_t), 6);
-        compute_encoder->setBytes(y.strides().data(), ndim * sizeof(size_t), 7);
-        compute_encoder->setBytes(&ndim, sizeof(int), 8);
+        compute_encoder.set_vector_bytes(x.shape(), 5);
+        compute_encoder.set_vector_bytes(x.strides(), 6);
+        compute_encoder.set_vector_bytes(y.strides(), 7);
+        compute_encoder.set_bytes(ndim, 8);

        // We launch 1 thread for each input and make sure that the number of
        // threads in any given threadgroup is not higher than the max allowed
@@ -530,7 +516,7 @@ below.

        // Launch the grid with the given number of threads divided among
        // the given threadgroups
-        compute_encoder.dispatchThreads(grid_dims, group_dims);
+        compute_encoder.dispatch_threads(grid_dims, group_dims);
    }

 We can now call the :meth:`axpby` operation on both the CPU and the GPU!
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -14,7 +14,7 @@ silicon computer is
 To install from PyPI you must meet the following requirements:

 - Using an M series chip (Apple silicon)
-- Using a native Python >= 3.8
+- Using a native Python >= 3.9
 - macOS >= 13.5

 .. note::
@@ -209,7 +209,7 @@ Metal library by run-time compiling kernels the first time they are used in MLX
 on a given machine. Note run-time compilation incurs a cold-start cost which can
-be anwywhere from a few hundred millisecond to a few seconds depending on the
+be anywhere from a few hundred milliseconds to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
-Metal kernel cache persists accross reboots.
+Metal kernel cache persists across reboots.

 Troubleshooting
 ^^^^^^^^^^^^^^^
@@ -240,7 +240,7 @@ x86 Shell

 .. _build shell:

-If the ouptut of ``uname -p``  is ``x86`` then your shell is running as x86 via
+If the output of ``uname -p``  is ``x86`` then your shell is running as x86 via
 Rosetta instead of natively.

 To fix this, find the application in Finder (``/Applications`` for iTerm,
@@ -264,4 +264,4 @@ Also check that cmake is using the correct architecture:

 If you see ``"x86_64"``, try re-installing ``cmake``. If you see ``"arm64"``
 but the build errors out with "Building for x86_64 on macOS is not supported."
-wipe your build cahce with ``rm -rf build/`` and try again.
+wipe your build cache with ``rm -rf build/`` and try again.
--- a/docs/src/python/fast.rst
+++ b/docs/src/python/fast.rst
@@ -12,5 +12,4 @@ Fast
  layer_norm
  rope
  scaled_dot_product_attention
-  affine_quantize
  metal_kernel
--- a/docs/src/python/linalg.rst
+++ b/docs/src/python/linalg.rst
@@ -13,5 +13,8 @@ Linear Algebra
    norm
    cholesky
    cholesky_inv
+    cross
    qr
    svd
+    eigvalsh
+    eigh
--- a/docs/src/python/metal.rst
+++ b/docs/src/python/metal.rst
@@ -14,6 +14,7 @@ Metal
  get_cache_memory
  set_memory_limit
  set_cache_limit
+  set_wired_limit
  clear_cache
  start_capture
  stop_capture
--- a/docs/src/python/nn/functions.rst
+++ b/docs/src/python/nn/functions.rst
@@ -13,6 +13,7 @@ simple functions.
   :template: nn-module-template.rst

   elu
+   celu
   gelu
   gelu_approx
   gelu_fast_approx
--- a/docs/src/python/nn/layers.rst
+++ b/docs/src/python/nn/layers.rst
@@ -12,7 +12,9 @@ Layers
   ALiBi
   AvgPool1d
   AvgPool2d
+   AvgPool3d
   BatchNorm
+   CELU
   Conv1d
   Conv2d
   Conv3d
@@ -23,6 +25,7 @@ Layers
   Dropout2d
   Dropout3d
   Embedding
+   ELU
   GELU
   GLU
   GroupNorm
@@ -34,9 +37,12 @@ Layers
   LayerNorm
   LeakyReLU
   Linear
+   LogSigmoid
+   LogSoftmax
   LSTM
   MaxPool1d
   MaxPool2d
+   MaxPool3d
   Mish
   MultiHeadAttention
   PReLU
@@ -49,6 +55,7 @@ Layers
   RoPE
   SELU
   Sequential
+   Sigmoid
   SiLU
   SinusoidalPositionalEncoding
   Softmin
--- a/docs/src/python/ops.rst
+++ b/docs/src/python/ops.rst
@@ -80,6 +80,7 @@ Operations
   greater_equal
   hadamard_transform
   identity
+   imag
   inner
   isfinite
   isclose
@@ -121,14 +122,17 @@ Operations
   pad
   power
   prod
+   put_along_axis
   quantize
   quantized_matmul
   radians
+   real
   reciprocal
   remainder
   repeat
   reshape
   right_shift
+   roll
   round
   rsqrt
   save
--- a/docs/src/python/random.rst
+++ b/docs/src/python/random.rst
@@ -45,3 +45,4 @@ we use a splittable version of Threefry, which is a counter-based PRNG.
   truncated_normal
   uniform
   laplace
+   permutation
--- a/docs/src/usage/compile.rst
+++ b/docs/src/usage/compile.rst
@@ -33,12 +33,12 @@ Let's start with a simple example:
  # Compile the function
  compiled_fun = mx.compile(fun)

-  # Prints: array(2.36788, dtype=float32) 
+  # Prints: array(2.36788, dtype=float32)
  print(compiled_fun(x, y))

 The output of both the regular function and the compiled function is the same
 up to numerical precision.
-   
+
 The first time you call a compiled function, MLX will build the compute
 graph, optimize it, and generate and compile code. This can be relatively
 slow. However, MLX will cache compiled functions, so calling a compiled
@@ -96,7 +96,7 @@ element-wise operations:

 .. code-block:: python

-  def gelu(x):  
+  def gelu(x):
      return x * (1 + mx.erf(x / math.sqrt(2))) / 2

 If you use this function with small arrays, it will be overhead bound. If you
@@ -136,13 +136,6 @@ Now make an array, and benchmark both functions:
 On an M1 Max the times are 15.5 and 3.1 milliseconds. The compiled ``gelu`` is
 five times faster.

-.. note::
-
-  As of the latest MLX, CPU functions are not fully compiled. Compiling CPU
-  functions can still be helpful, but won't typically result in as large a
-  speedup as compiling operations that run on the GPU.
-
-
 Debugging
 ---------

@@ -287,7 +280,7 @@ to the function. In some cases this can be pretty inconvenient. Hence,
  print(fun(mx.array(1.0)))


-Compiling Training Graphs 
+Compiling Training Graphs
 -------------------------

 This section will step through how to use :func:`compile` with a simple example
@@ -297,7 +290,7 @@ full forward, backward, and update with :func:`compile`.

 To start, here is the simple example without any compilation:

-.. code-block:: python 
+.. code-block:: python

  import mlx.core as mx
  import mlx.nn as nn
@@ -330,7 +323,7 @@ To start, here is the simple example without any compilation:
 To compile the update we can put it all in a function and compile it with the
 appropriate input and output captures. Here's the same example but compiled:

-.. code-block:: python 
+.. code-block:: python

  import mlx.core as mx
  import mlx.nn as nn
@@ -355,7 +348,7 @@ appropriate input and output captures. Here's the same example but compiled:

  # The state that will be captured as input and output
  state = [model.state, optimizer.state]
-      
+
  @partial(mx.compile, inputs=state, outputs=state)
  def step(x, y):
      loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
@@ -410,7 +403,7 @@ Compiling transformed functions works just as expected:

   In order to compile as much as possible, a transformation of a compiled
   function will not by default be compiled. To compile the transformed
-   function simply pass it through :func:`compile`. 
+   function simply pass it through :func:`compile`.

 You can also compile functions which themselves call compiled functions. A
 good practice is to compile the outer most function to give :func:`compile`
--- a/docs/src/usage/function_transforms.rst
+++ b/docs/src/usage/function_transforms.rst
@@ -25,7 +25,7 @@ Here is a simple example:

 The output of :func:`grad` on :func:`sin` is simply another function. In this
 case it is the gradient of the sine function which is exactly the cosine
-function. To get the second derivative you can do: 
+function. To get the second derivative you can do:

 .. code-block:: shell

@@ -50,7 +50,7 @@ Automatic Differentiation
 .. _auto diff:

 Automatic differentiation in MLX works on functions rather than on implicit
-graphs. 
+graphs.

 .. note::

@@ -114,7 +114,7 @@ way to do that is the following:

   def loss_fn(params, x, y):
      w, b = params["weight"], params["bias"]
-      h = w * x + b 
+      h = w * x + b
      return mx.mean(mx.square(h - y))

   params = {"weight": mx.array(1.0), "bias": mx.array(0.0)}
@@ -132,7 +132,7 @@ way to do that is the following:

 Notice the tree structure of the parameters is preserved in the gradients.

-In some cases you may want to stop gradients from propagating through a 
+In some cases you may want to stop gradients from propagating through a
 part of the function. You can use the :func:`stop_gradient` for that.


@@ -161,19 +161,19 @@ A naive way to add the elements from two sets of vectors is with a loop:
  ys = mx.random.uniform(shape=(100, 4096))

  def naive_add(xs, ys):
-      return [xs[i] + ys[:, i] for i in range(xs.shape[1])]
+      return [xs[i] + ys[:, i] for i in range(xs.shape[0])]

 Instead you can use :func:`vmap` to automatically vectorize the addition:

 .. code-block:: python
-   
+
-   # Vectorize over the second dimension of x and the
-   # first dimension of y
+   # Vectorize over the first dimension of x and the
+   # second dimension of y
-   vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(1, 0))
+   vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(0, 1))

 The ``in_axes`` parameter can be used to specify which dimensions of the
 corresponding input to vectorize over. Similarly, use ``out_axes`` to specify
-where the vectorized axes should be in the outputs. 
+where the vectorized axes should be in the outputs.

 Let's time these two different versions:

@@ -184,8 +184,8 @@ Let's time these two different versions:
  print(timeit.timeit(lambda: mx.eval(naive_add(xs, ys)), number=100))
  print(timeit.timeit(lambda: mx.eval(vmap_add(xs, ys)), number=100))

-On an M1 Max the naive version takes in total ``0.390`` seconds whereas the
-vectorized version takes only ``0.025`` seconds, more than ten times faster.
+On an M1 Max the naive version takes in total ``5.639`` seconds whereas the
+vectorized version takes only ``0.024`` seconds, more than 200 times faster.

 Of course, this operation is quite contrived. A better approach is to simply do
 ``xs + ys.T``, but for more complex functions :func:`vmap` can be quite handy.
--- a/docs/src/usage/indexing.rst
+++ b/docs/src/usage/indexing.rst
@@ -51,7 +51,7 @@ You can also use an :obj:`array` to index another :obj:`array`:
 .. code-block:: shell

  >>> arr = mx.arange(10)
-  >>> idx = mx.array([5, 7]) 
+  >>> idx = mx.array([5, 7])
  >>> arr[idx]
  array([5, 7], dtype=int32)

@@ -77,12 +77,12 @@ from the GPU. Performing bounds checking for array indices before launching the
 kernel would be extremely inefficient.

 Indexing with boolean masks is something that MLX may support in the future. In
-general, MLX has limited support for operations for which outputs
+general, MLX has limited support for operations for which output
 *shapes* are dependent on input *data*. Other examples of these types of
 operations which MLX does not yet support include :func:`numpy.nonzero` and the
 single input version of :func:`numpy.where`.

-In Place Updates 
+In Place Updates
 ----------------

 In place updates to indexed arrays are possible in MLX. For example:
--- a/docs/src/usage/lazy_evaluation.rst
+++ b/docs/src/usage/lazy_evaluation.rst
@@ -13,7 +13,7 @@ compute graph is recorded. The actual computation only happens if an
 :func:`eval` is performed.

 MLX uses lazy evaluation because it has some nice features, some of which we
-describe below. 
+describe below.

 Transforming Compute Graphs
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -109,14 +109,14 @@ Here is a concrete example:

 An important behavior to be aware of is when the graph will be implicitly
 evaluated. Anytime you ``print`` an array, convert it to an
-:obj:`numpy.ndarray`, or otherwise access it's memory via :obj:`memoryview`,
+:obj:`numpy.ndarray`, or otherwise access its memory via :obj:`memoryview`,
 the graph will be evaluated. Saving arrays via :func:`save` (or any other MLX
 saving functions) will also evaluate the array.


 Calling :func:`array.item` on a scalar array will also evaluate it. In the
 example above, printing the loss (``print(loss)``) or adding the loss scalar to
-a list (``losses.append(loss.item())``) would cause a graph evaluation. If 
+a list (``losses.append(loss.item())``) would cause a graph evaluation. If
 these lines are before ``mx.eval(loss, model.parameters())`` then this
 will be a partial evaluation, computing only the forward pass.

--- a/docs/src/usage/numpy.rst
+++ b/docs/src/usage/numpy.rst
@@ -3,10 +3,10 @@
 Conversion to NumPy and Other Frameworks
 ========================================

-MLX array supports conversion between other frameworks with either:  
+MLX array supports conversion between other frameworks with either:

-* The `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_. 
-* `DLPack <https://dmlc.github.io/dlpack/latest/>`_.  
+* The `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_.
+* `DLPack <https://dmlc.github.io/dlpack/latest/>`_.

 Let's convert an array to NumPy and back.

@@ -66,7 +66,7 @@ even though no in-place operations on MLX memory are executed.
 PyTorch
 -------

-.. warning:: 
+.. warning::

   PyTorch Support for :obj:`memoryview` is experimental and can break for
   multi-dimensional arrays. Casting to NumPy first is advised for now.
--- a/docs/src/usage/quick_start.rst
+++ b/docs/src/usage/quick_start.rst
@@ -64,4 +64,4 @@ Other gradient transformations include :func:`vjp` for vector-Jacobian products
 and :func:`jvp` for Jacobian-vector products.

 Use :func:`value_and_grad` to efficiently compute both a function's output and
-gradient with respect to the function's input. 
+gradient with respect to the function's input.
--- a/docs/src/usage/saving_and_loading.rst
+++ b/docs/src/usage/saving_and_loading.rst
@@ -8,33 +8,33 @@ Saving and Loading Arrays
 MLX supports multiple array serialization formats.

 .. list-table:: Serialization Formats
-   :widths: 20 8 25 25 
+   :widths: 20 8 25 25
   :header-rows: 1

-   * - Format 
-     - Extension 
+   * - Format
+     - Extension
     - Function
-     - Notes 
-   * - NumPy 
-     - ``.npy`` 
+     - Notes
+   * - NumPy
+     - ``.npy``
     - :func:`save`
     - Single arrays only
-   * - NumPy archive 
-     - ``.npz`` 
+   * - NumPy archive
+     - ``.npz``
     - :func:`savez` and :func:`savez_compressed`
-     - Multiple arrays 
+     - Multiple arrays
   * - Safetensors
-     - ``.safetensors`` 
+     - ``.safetensors``
     - :func:`save_safetensors`
-     - Multiple arrays 
-   * - GGUF 
-     - ``.gguf`` 
+     - Multiple arrays
+   * - GGUF
+     - ``.gguf``
     - :func:`save_gguf`
     - Multiple arrays

 The :func:`load` function will load any of the supported serialization
 formats. It determines the format from the extensions. The output of
-:func:`load` depends on the format. 
+:func:`load` depends on the format.

 Here's an example of saving a single array to a file:

--- a/docs/src/usage/unified_memory.rst
+++ b/docs/src/usage/unified_memory.rst
@@ -20,7 +20,7 @@ Both ``a`` and ``b`` live in unified memory.

 In MLX, rather than moving arrays to devices, you specify the device when you
 run the operation. Any device can perform any operation on ``a`` and ``b``
-without needing to move them from one memory location to another. For example: 
+without needing to move them from one memory location to another. For example:

 .. code-block:: python

--- a/examples/extensions/CMakeLists.txt
+++ b/examples/extensions/CMakeLists.txt
@@ -11,10 +11,14 @@ option(BUILD_SHARED_LIBS "Build extensions as a shared library" ON)

 # ----------------------------- Dependencies -----------------------------
 find_package(MLX CONFIG REQUIRED)
-find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
+find_package(
+  Python 3.8
+  COMPONENTS Interpreter Development.Module
+  REQUIRED)
 execute_process(
  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
-  OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR)
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE NB_DIR)
 list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
 find_package(nanobind CONFIG REQUIRED)

@@ -24,16 +28,10 @@ find_package(nanobind CONFIG REQUIRED)
 add_library(mlx_ext)

 # Add sources
-target_sources(
-  mlx_ext
-  PUBLIC
-  ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.cpp
-)
+target_sources(mlx_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.cpp)

 # Add include headers
-target_include_directories(
-  mlx_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR}
-)
+target_include_directories(mlx_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR})

 # Link to mlx
 target_link_libraries(mlx_ext PUBLIC mlx)
@@ -43,27 +41,32 @@ target_link_libraries(mlx_ext PUBLIC mlx)
 # Build metallib
 if(MLX_BUILD_METAL)
  mlx_build_metallib(
-    TARGET mlx_ext_metallib
-    TITLE mlx_ext
-    SOURCES ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.metal
-    INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${MLX_INCLUDE_DIRS}
-    OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
-  )
-
-  add_dependencies(
-    mlx_ext
+    TARGET
    mlx_ext_metallib
-  )
+    TITLE
+    mlx_ext
+    SOURCES
+    ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.metal
+    INCLUDE_DIRS
+    ${PROJECT_SOURCE_DIR}
+    ${MLX_INCLUDE_DIRS}
+    OUTPUT_DIRECTORY
+    ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+
+  add_dependencies(mlx_ext mlx_ext_metallib)

 endif()

 # ----------------------------- Python Bindings -----------------------------
 nanobind_add_module(
  _ext
-  NB_STATIC STABLE_ABI LTO NOMINSIZE
-  NB_DOMAIN mlx 
-  ${CMAKE_CURRENT_LIST_DIR}/bindings.cpp
-)
+  NB_STATIC
+  STABLE_ABI
+  LTO
+  NOMINSIZE
+  NB_DOMAIN
+  mlx
+  ${CMAKE_CURRENT_LIST_DIR}/bindings.cpp)
 target_link_libraries(_ext PRIVATE mlx_ext)

 if(BUILD_SHARED_LIBS)
--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@@ -257,7 +257,7 @@ void Axpby::eval_gpu(

  // Prepare to encode kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  // Kernel parameters are registered with buffer indices corresponding to
  // those in the kernel declaration at axpby.metal
@@ -272,15 +272,15 @@ void Axpby::eval_gpu(
  compute_encoder.set_output_array(out, 2);

  // Encode alpha and beta
-  compute_encoder->setBytes(&alpha_, sizeof(float), 3);
-  compute_encoder->setBytes(&beta_, sizeof(float), 4);
+  compute_encoder.set_bytes(alpha_, 3);
+  compute_encoder.set_bytes(beta_, 4);

  // Encode shape, strides and ndim if needed
  if (!contiguous_kernel) {
-    compute_encoder->setBytes(x.shape().data(), ndim * sizeof(int), 5);
-    compute_encoder->setBytes(x.strides().data(), ndim * sizeof(size_t), 6);
-    compute_encoder->setBytes(y.strides().data(), ndim * sizeof(size_t), 7);
-    compute_encoder->setBytes(&ndim, sizeof(int), 8);
+    compute_encoder.set_vector_bytes(x.shape(), 5);
+    compute_encoder.set_vector_bytes(x.strides(), 6);
+    compute_encoder.set_vector_bytes(y.strides(), 7);
+    compute_encoder.set_bytes(ndim, 8);
  }

  // We launch 1 thread for each input and make sure that the number of
@@ -295,7 +295,7 @@ void Axpby::eval_gpu(

  // Launch the grid with the given number of threads divided among
  // the given threadgroups
-  compute_encoder.dispatchThreads(grid_dims, group_dims);
+  compute_encoder.dispatch_threads(grid_dims, group_dims);
 }

 #else // Metal is not available
--- a/examples/extensions/axpby/axpby.metal
+++ b/examples/extensions/axpby/axpby.metal
@@ -2,7 +2,6 @@

 #include <metal_stdlib>

-#include "mlx/backend/metal/kernels/bf16.h"
 #include "mlx/backend/metal/kernels/utils.h"

 template <typename T>
@@ -13,8 +12,8 @@ template <typename T>
    constant const float& alpha [[buffer(3)]],
    constant const float& beta [[buffer(4)]],
    constant const int* shape [[buffer(5)]],
-    constant const size_t* x_strides [[buffer(6)]],
-    constant const size_t* y_strides [[buffer(7)]],
+    constant const int64_t* x_strides [[buffer(6)]],
+    constant const int64_t* y_strides [[buffer(7)]],
    constant const int& ndim [[buffer(8)]],
    uint index [[thread_position_in_grid]]) {
  auto x_offset = elem_to_loc(index, shape, x_strides, ndim);
@@ -35,29 +34,14 @@ template <typename T>
      static_cast<T>(alpha) * x[index] + static_cast<T>(beta) * y[index];
 }

-#define instantiate_axpby(type_name, type)                               \
-  template [[host_name("axpby_general_" #type_name)]] [[kernel]] void    \
-  axpby_general<type>(                                                   \
-      device const type* x [[buffer(0)]],                                \
-      device const type* y [[buffer(1)]],                                \
-      device type* out [[buffer(2)]],                                    \
-      constant const float& alpha [[buffer(3)]],                         \
-      constant const float& beta [[buffer(4)]],                          \
-      constant const int* shape [[buffer(5)]],                           \
-      constant const size_t* x_strides [[buffer(6)]],                    \
-      constant const size_t* y_strides [[buffer(7)]],                    \
-      constant const int& ndim [[buffer(8)]],                            \
-      uint index [[thread_position_in_grid]]);                           \
-  template [[host_name("axpby_contiguous_" #type_name)]] [[kernel]] void \
-  axpby_contiguous<type>(                                                \
-      device const type* x [[buffer(0)]],                                \
-      device const type* y [[buffer(1)]],                                \
-      device type* out [[buffer(2)]],                                    \
-      constant const float& alpha [[buffer(3)]],                         \
-      constant const float& beta [[buffer(4)]],                          \
-      uint index [[thread_position_in_grid]]);
+// clang-format off
+#define instantiate_axpby(type_name, type)                             \
+  instantiate_kernel("axpby_general_" #type_name, axpby_general, type) \
+  instantiate_kernel(                                                  \
+          "axpby_contiguous_" #type_name, axpby_contiguous, type)

 instantiate_axpby(float32, float);
 instantiate_axpby(float16, half);
 instantiate_axpby(bfloat16, bfloat16_t);
-instantiate_axpby(complex64, complex64_t);
+instantiate_axpby(complex64, complex64_t);
+// clang-format on
--- a/examples/extensions/pyproject.toml
+++ b/examples/extensions/pyproject.toml
@@ -2,7 +2,7 @@
 requires = [
  "setuptools>=42",
  "cmake>=3.24",
-  "mlx>=0.17.0",
-  "nanobind==2.1.0",
+  "mlx>=0.21.0",
+  "nanobind==2.2.0",
 ]
 build-backend = "setuptools.build_meta"
--- a/examples/extensions/requirements.txt
+++ b/examples/extensions/requirements.txt
@@ -1,4 +1,4 @@
 setuptools>=42
 cmake>=3.24
-mlx>=0.17.0
-nanobind==2.1.0
+mlx>=0.21.0
+nanobind==2.2.0
--- a/mlx.pc.in
+++ b/mlx.pc.in
@@ -28,10 +28,19 @@ endif()
 if (@MLX_BUILD_METAL@)
    set(MLX_BUILD_METAL @MLX_BUILD_METAL@)
    set(MLX_CXX_FLAGS ${MLX_CXX_FLAGS} -D_METAL_)
-    set_and_check(MLX_INCLUDE_DIRS 
-        ${MLX_INCLUDE_DIRS} 
+    set(MLX_INCLUDE_DIRS 
+        "${MLX_INCLUDE_DIRS};"
        @PACKAGE_CMAKE_INSTALL_INCLUDEDIR@/metal_cpp
    )
+    if(@MLX_METAL_VERSION@ GREATER_EQUAL 310)
+      set(MLX_INCLUDE_DIRS
+        "${MLX_INCLUDE_DIRS};"
+        @PACKAGE_CMAKE_INSTALL_INCLUDEDIR@/mlx/backend/metal/kernels/metal_3_1)
+    else()
+      set(MLX_INCLUDE_DIRS
+        "${MLX_INCLUDE_DIRS};"
+        @PACKAGE_CMAKE_INSTALL_INCLUDEDIR@/mlx/backend/metal/kernels/metal_3_0)
+    endif()
 endif()

 set_target_properties(mlx PROPERTIES
@@ -40,4 +49,4 @@ set_target_properties(mlx PROPERTIES
 )

 include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(MLX DEFAULT_MSG MLX_LIBRARY MLX_INCLUDE_DIRS)
+find_package_handle_standard_args(MLX DEFAULT_MSG MLX_LIBRARY MLX_INCLUDE_DIRS)
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -1,26 +1,24 @@
 target_sources(
  mlx
-  PRIVATE
-  ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/graph_utils.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/random.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/scheduler.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/transforms.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/linalg.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h
-)
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/graph_utils.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/random.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/scheduler.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/transforms.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/linalg.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h)

-if (MLX_BUILD_CPU)
+if(MLX_BUILD_CPU)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
 else()
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_cpu)
@@ -28,17 +26,15 @@ endif()

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/distributed)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
-if (MLX_BUILD_ACCELERATE)
+if(MLX_BUILD_ACCELERATE)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
 elseif(MLX_BUILD_CPU)
  target_sources(
    mlx
-    PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}/backend/common/default_primitives.cpp
-  )
+    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/common/default_primitives.cpp)
 endif()

-if (MLX_BUILD_METAL)
+if(MLX_BUILD_METAL)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/metal)
 else()
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_metal)
--- a/mlx/allocator.cpp
+++ b/mlx/allocator.cpp
@@ -19,7 +19,7 @@ Buffer malloc(size_t size) {
 }

 void free(Buffer buffer) {
-  return allocator().free(buffer);
+  allocator().free(buffer);
 }

 Buffer CommonAllocator::malloc(size_t size, bool) {
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -1,5 +1,6 @@
 // Copyright © 2023-2024 Apple Inc.
 #include <functional>
+#include <unordered_map>

 #include "mlx/array.h"
 #include "mlx/ops.h"
@@ -30,7 +31,7 @@ array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
 }

 array::array(
-    std::vector<int> shape,
+    Shape shape,
    Dtype dtype,
    std::shared_ptr<Primitive> primitive,
    std::vector<array> inputs)
@@ -41,7 +42,7 @@ array::array(
          std::move(inputs))) {}

 std::vector<array> array::make_arrays(
-    std::vector<std::vector<int>> shapes,
+    std::vector<Shape> shapes,
    const std::vector<Dtype>& dtypes,
    const std::shared_ptr<Primitive>& primitive,
    const std::vector<array>& inputs) {
@@ -73,11 +74,7 @@ array::array(std::initializer_list<int> data, Dtype dtype)
 }

 /* Build an array from a shared buffer */
-array::array(
-    allocator::Buffer data,
-    std::vector<int> shape,
-    Dtype dtype,
-    deleter_t deleter)
+array::array(allocator::Buffer data, Shape shape, Dtype dtype, Deleter deleter)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
  set_data(data, deleter);
 }
@@ -95,13 +92,29 @@ void array::detach() {
  array_desc_->primitive = nullptr;
 }

-void array::eval() {
-  // Ensure the array is ready to be read
-  if (status() == Status::scheduled) {
+bool array::is_available() const {
+  if (status() == Status::available) {
+    return true;
+  } else if (status() == Status::evaluated && event().is_signaled()) {
+    set_status(Status::available);
+    return true;
+  }
+  return false;
+}
+
+void array::wait() {
+  if (!is_available()) {
    event().wait();
    set_status(Status::available);
-  } else if (status() == Status::unscheduled) {
+  }
+}
+
+void array::eval() {
+  // Ensure the array is ready to be read
+  if (status() == Status::unscheduled) {
    mlx::core::eval({*this});
+  } else {
+    wait();
  }
 }

@@ -109,7 +122,7 @@ bool array::is_tracer() const {
  return array_desc_->is_tracer && in_tracing() || retain_graph();
 }

-void array::set_data(allocator::Buffer buffer, deleter_t d) {
+void array::set_data(allocator::Buffer buffer, Deleter d) {
  array_desc_->data = std::make_shared<Data>(buffer, d);
  array_desc_->data_ptr = buffer.raw_ptr();
  array_desc_->data_size = size();
@@ -122,9 +135,9 @@ void array::set_data(allocator::Buffer buffer, deleter_t d) {
 void array::set_data(
    allocator::Buffer buffer,
    size_t data_size,
-    std::vector<size_t> strides,
+    Strides strides,
    Flags flags,
-    deleter_t d) {
+    Deleter d) {
  array_desc_->data = std::make_shared<Data>(buffer, d);
  array_desc_->data_ptr = buffer.raw_ptr();
  array_desc_->data_size = data_size;
@@ -134,7 +147,7 @@ void array::set_data(

 void array::copy_shared_buffer(
    const array& other,
-    const std::vector<size_t>& strides,
+    const Strides& strides,
    Flags flags,
    size_t data_size,
    size_t offset /* = 0 */) {
@@ -153,7 +166,7 @@ void array::copy_shared_buffer(const array& other) {

 void array::move_shared_buffer(
    array other,
-    const std::vector<size_t>& strides,
+    const Strides& strides,
    Flags flags,
    size_t data_size,
    size_t offset /* = 0 */) {
@@ -162,8 +175,10 @@ void array::move_shared_buffer(
  array_desc_->flags = flags;
  array_desc_->data_size = data_size;
  auto char_offset = sizeof(char) * itemsize() * offset;
-  array_desc_->data_ptr = static_cast<void*>(
-      static_cast<char*>(other.array_desc_->data_ptr) + char_offset);
+  auto data_ptr = other.array_desc_->data_ptr;
+  other.array_desc_->data_ptr = nullptr;
+  array_desc_->data_ptr =
+      static_cast<void*>(static_cast<char*>(data_ptr) + char_offset);
 }

 void array::move_shared_buffer(array other) {
@@ -196,6 +211,8 @@ array::~array() {
    if (do_detach) {
      for (auto& s : siblings()) {
        for (auto& ss : s.siblings()) {
+          // Set to null here to avoid descending into array destructor
+          // for siblings
          ss.array_desc_ = nullptr;
        }
        s.array_desc_->siblings.clear();
@@ -216,13 +233,13 @@ void array::ArrayDesc::init() {
  }
 }

-array::ArrayDesc::ArrayDesc(std::vector<int> shape, Dtype dtype)
+array::ArrayDesc::ArrayDesc(Shape shape, Dtype dtype)
    : shape(std::move(shape)), dtype(dtype), status(Status::available) {
  init();
 }

 array::ArrayDesc::ArrayDesc(
-    std::vector<int> shape,
+    Shape shape,
    Dtype dtype,
    std::shared_ptr<Primitive> primitive,
    std::vector<array> inputs)
@@ -242,25 +259,46 @@ array::ArrayDesc::~ArrayDesc() {
  // This calls recursively the destructor and can result in stack overflow, we
  // instead put them in a vector and destroy them one at a time resulting in a
  // max stack depth of 2.
+  if (inputs.empty()) {
+    return;
+  }
+
  std::vector<std::shared_ptr<ArrayDesc>> for_deletion;

-  for (array& a : inputs) {
-    if (a.array_desc_.use_count() == 1) {
-      for_deletion.push_back(std::move(a.array_desc_));
+  auto append_deletable_inputs = [&for_deletion](ArrayDesc& ad) {
+    std::unordered_map<std::uintptr_t, array> input_map;
+    for (array& a : ad.inputs) {
+      if (a.array_desc_) {
+        input_map.insert({a.id(), a});
+        for (auto& s : a.siblings()) {
+          input_map.insert({s.id(), s});
+        }
+      }
    }
-  }
+    ad.inputs.clear();
+    for (auto& [_, a] : input_map) {
+      if (a.array_desc_.use_count() <= a.siblings().size() + 1) {
+        for_deletion.push_back(std::move(a.array_desc_));
+      }
+    }
+  };
+
+  append_deletable_inputs(*this);

  while (!for_deletion.empty()) {
    // top is going to be deleted at the end of the block *after* the arrays
    // with inputs have been moved into the vector
    auto top = std::move(for_deletion.back());
    for_deletion.pop_back();
+    append_deletable_inputs(*top);

-    for (array& a : top->inputs) {
-      if (a.array_desc_.use_count() == 1) {
-        for_deletion.push_back(std::move(a.array_desc_));
-      }
+    // Clear out possible siblings to break circular references
+    for (auto& s : top->siblings) {
+      // Set to null here to avoid descending into top-level
+      // array destructor for siblings
+      s.array_desc_ = nullptr;
    }
+    top->siblings.clear();
  }
 }

--- a/mlx/array.h
+++ b/mlx/array.h
@@ -15,7 +15,10 @@ namespace mlx::core {

 // Forward declaration
 class Primitive;
-using deleter_t = std::function<void(allocator::Buffer)>;
+
+using Deleter = std::function<void(allocator::Buffer)>;
+using Shape = std::vector<int32_t>;
+using Strides = std::vector<int64_t>;

 class array {
  /* An array is really a node in a graph. It contains a shared ArrayDesc
@@ -33,7 +36,7 @@ class array {
  template <typename It>
  array(
      It data,
-      std::vector<int> shape,
+      Shape shape,
      Dtype dtype =
          TypeToDtype<typename std::iterator_traits<It>::value_type>());

@@ -49,15 +52,15 @@ class array {
  template <typename T>
  array(
      std::initializer_list<T> data,
-      std::vector<int> shape,
+      Shape shape,
      Dtype dtype = TypeToDtype<T>());

  /* Build an array from a buffer */
  array(
      allocator::Buffer data,
-      std::vector<int> shape,
+      Shape shape,
      Dtype dtype,
-      deleter_t deleter = allocator::free);
+      Deleter deleter = allocator::free);

  /** Assignment to rvalue does not compile. */
  array& operator=(const array& other) && = delete;
@@ -96,7 +99,7 @@ class array {
  }

  /** The shape of the array as a vector of integers. */
-  const std::vector<int>& shape() const {
+  const Shape& shape() const {
    return array_desc_->shape;
  }

@@ -105,12 +108,12 @@ class array {
   *
   *  This function supports negative indexing and provides
   *  bounds checking. */
-  int shape(int dim) const {
+  auto shape(int dim) const {
    return shape().at(dim < 0 ? dim + ndim() : dim);
  }

  /** The strides of the array. */
-  const std::vector<size_t>& strides() const {
+  const Strides& strides() const {
    return array_desc_->strides;
  }

@@ -119,7 +122,7 @@ class array {
   *
   *  This function supports negative indexing and provides
   *  bounds checking. */
-  size_t strides(int dim) const {
+  auto strides(int dim) const {
    return strides().at(dim < 0 ? dim + ndim() : dim);
  }

@@ -184,13 +187,13 @@ class array {
   */

  array(
-      std::vector<int> shape,
+      Shape shape,
      Dtype dtype,
      std::shared_ptr<Primitive> primitive,
      std::vector<array> inputs);

  static std::vector<array> make_arrays(
-      std::vector<std::vector<int>> shapes,
+      std::vector<Shape> shapes,
      const std::vector<Dtype>& dtypes,
      const std::shared_ptr<Primitive>& primitive,
      const std::vector<array>& inputs);
@@ -207,8 +210,8 @@ class array {

  struct Data {
    allocator::Buffer buffer;
-    deleter_t d;
-    Data(allocator::Buffer buffer, deleter_t d = allocator::free)
+    Deleter d;
+    Data(allocator::Buffer buffer, Deleter d = allocator::free)
        : buffer(buffer), d(d) {}
    // Not copyable
    Data(const Data& d) = delete;
@@ -344,11 +347,33 @@ class array {
    return static_cast<T*>(array_desc_->data_ptr);
  }

-  enum Status { unscheduled, scheduled, available };
+  enum Status {
+    // The output of a computation which has not been scheduled.
+    // For example, the status of `x` in `auto x = a + b`.
+    unscheduled,

-  bool is_available() const {
-    return status() == Status::available;
-  }
+    // The output of a computation which has been scheduled but `eval_*` has
+    // not yet been called on the array's primitive. A possible
+    // status of `x` in `auto x = a + b; eval(x);`
+    scheduled,
+
+    // The array's `eval_*` function has been run, but the computation is not
+    // necessarily complete. The array will have memory allocated and if it is
+    // not a tracer then it will be detached from the graph.
+    evaluated,
+
+    // If the array is the output of a computation then the computation
+    // is complete. Constant arrays are always available (e.g. `array({1, 2,
+    // 3})`)
+    available
+  };
+
+  // Check if the array is safe to read.
+  bool is_available() const;
+
+  // Wait on the array to be available. After this `is_available` returns
+  // `true`.
+  void wait();

  Status status() const {
    return array_desc_->status;
@@ -375,18 +400,18 @@ class array {
  // Check if the array is a tracer array
  bool is_tracer() const;

-  void set_data(allocator::Buffer buffer, deleter_t d = allocator::free);
+  void set_data(allocator::Buffer buffer, Deleter d = allocator::free);

  void set_data(
      allocator::Buffer buffer,
      size_t data_size,
-      std::vector<size_t> strides,
+      Strides strides,
      Flags flags,
-      deleter_t d = allocator::free);
+      Deleter d = allocator::free);

  void copy_shared_buffer(
      const array& other,
-      const std::vector<size_t>& strides,
+      const Strides& strides,
      Flags flags,
      size_t data_size,
      size_t offset = 0);
@@ -395,7 +420,7 @@ class array {

  void move_shared_buffer(
      array other,
-      const std::vector<size_t>& strides,
+      const Strides& strides,
      Flags flags,
      size_t data_size,
      size_t offset = 0);
@@ -414,8 +439,8 @@ class array {
  void init(const It src);

  struct ArrayDesc {
-    std::vector<int> shape;
-    std::vector<size_t> strides;
+    Shape shape;
+    Strides strides;
    size_t size;
    Dtype dtype;
    std::shared_ptr<Primitive> primitive;
@@ -449,10 +474,10 @@ class array {
    // The arrays position in the output list
    uint32_t position{0};

-    explicit ArrayDesc(std::vector<int> shape, Dtype dtype);
+    explicit ArrayDesc(Shape shape, Dtype dtype);

    explicit ArrayDesc(
-        std::vector<int> shape,
+        Shape shape,
        Dtype dtype,
        std::shared_ptr<Primitive> primitive,
        std::vector<array> inputs);
@@ -480,7 +505,7 @@ array::array(T val, Dtype dtype /* = TypeToDtype<T>() */)
 template <typename It>
 array::array(
  It data,
-  std::vector<int> shape,
+  Shape shape,
  Dtype dtype /* = TypeToDtype<typename std::iterator_traits<It>::value_type>() */) :
    array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
  init(data);
@@ -499,7 +524,7 @@ array::array(
 template <typename T>
 array::array(
    std::initializer_list<T> data,
-    std::vector<int> shape,
+    Shape shape,
    Dtype dtype /* = TypeToDtype<T>() */)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
  if (data.size() != size()) {
--- a/mlx/backend/accelerate/CMakeLists.txt
+++ b/mlx/backend/accelerate/CMakeLists.txt
@@ -1,10 +1,8 @@
 target_sources(
  mlx
-  PRIVATE
-  ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
-)
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp)
--- a/mlx/backend/accelerate/primitives.cpp
+++ b/mlx/backend/accelerate/primitives.cpp
@@ -81,6 +81,7 @@ DEFAULT_MULTI(SVD)
 DEFAULT(Transpose)
 DEFAULT(Inverse)
 DEFAULT(Cholesky)
+DEFAULT_MULTI(Eigh)

 void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
--- a/mlx/backend/accelerate/quantized.cpp
+++ b/mlx/backend/accelerate/quantized.cpp
@@ -18,49 +18,61 @@ void _qmm_t_4_64(
    const float* biases,
    int M,
    int N,
-    int K) {
+    int K,
+    int B,
+    bool batched_w) {
  constexpr int bits = 4;
  constexpr int group_size = 64;
  constexpr int bitmask = (1 << bits) - 1;
  constexpr int pack_factor = 32 / bits;
  constexpr int packs_in_group = group_size / pack_factor;

-  for (int m = 0; m < M; m++) {
-    const uint32_t* w_local = w;
-    const float* scales_local = scales;
-    const float* biases_local = biases;
+  int w_els = N * K / pack_factor;
+  int g_els = w_els * pack_factor / group_size;

-    for (int n = 0; n < N; n++) {
-      const simd_float16* x_local = (simd_float16*)x;
-      simd_float16 sum = 0;
-      for (int k = 0; k < K; k += group_size) {
-        float scale = *scales_local++;
-        float bias = *biases_local++;
+  for (int i = 0; i < B; i++) {
+    for (int m = 0; m < M; m++) {
+      const uint32_t* w_local = w;
+      const float* scales_local = scales;
+      const float* biases_local = biases;

-        for (int kw = 0; kw < packs_in_group; kw += 2) {
-          // TODO: vectorize this properly
-          simd_uint16 wi;
-          for (int e = 0; e < 2; e++) {
-            uint32_t wii = *w_local++;
-            for (int p = 0; p < 8; p++) {
-              wi[e * 8 + p] = wii & bitmask;
-              wii >>= bits;
+      for (int n = 0; n < N; n++) {
+        const simd_float16* x_local = (simd_float16*)x;
+        simd_float16 sum = 0;
+        for (int k = 0; k < K; k += group_size) {
+          float scale = *scales_local++;
+          float bias = *biases_local++;
+
+          for (int kw = 0; kw < packs_in_group; kw += 2) {
+            // TODO: vectorize this properly
+            simd_uint16 wi;
+            for (int e = 0; e < 2; e++) {
+              uint32_t wii = *w_local++;
+              for (int p = 0; p < 8; p++) {
+                wi[e * 8 + p] = wii & bitmask;
+                wii >>= bits;
+              }
            }
-          }
-          simd_float16 wf = simd_float(wi);
-          wf *= scale;
-          wf += bias;
+            simd_float16 wf = simd_float(wi);
+            wf *= scale;
+            wf += bias;

-          sum += (*x_local) * wf;
-          x_local++;
+            sum += (*x_local) * wf;
+            x_local++;
+          }
        }
+
+        *result = simd_reduce_add(sum);
+        result++;
      }

-      *result = simd_reduce_add(sum);
-      result++;
+      x += K;
+    }
+    if (batched_w) {
+      w += w_els;
+      scales += g_els;
+      biases += g_els;
    }
-
-    x += K;
  }
 }

@@ -82,8 +94,10 @@ void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (condition) {
    out.set_data(allocator::malloc_or_wait(out.nbytes()));
    int K = x.shape(-1);
-    int M = x.size() / K;
+    int M = x.shape(-2);
    int N = out.shape(-1);
+    int B = x.size() / K / M;
+    bool batched_w = w.ndim() > 2;
    _qmm_t_4_64(
        out.data<float>(),
        x.data<float>(),
@@ -92,7 +106,9 @@ void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
        biases.data<float>(),
        M,
        N,
-        K);
+        K,
+        B,
+        batched_w);
  } else {
    eval(inputs, out);
  }
--- a/mlx/backend/accelerate/softmax.cpp
+++ b/mlx/backend/accelerate/softmax.cpp
@@ -33,8 +33,8 @@ namespace {
 * Note: The implementation below is a general fast exp. There could be faster
 *       implementations for numbers strictly < 0.
 */
-inline simd_float16 simd_fast_exp(simd_float16 x) {
-  x *= 1.442695; // multiply with log_2(e)
+inline simd_float16 simd_fast_exp(simd_float16 x_init) {
+  auto x = x_init * 1.442695; // multiply with log_2(e)
  simd_float16 ipart, fpart;
  simd_int16 epart;
  x = simd_clamp(x, -80, 80);
@@ -53,7 +53,9 @@ inline simd_float16 simd_fast_exp(simd_float16 x) {
  // bitshifting
  epart = (simd_int(ipart) + 127) << 23;

-  return (*(simd_float16*)&epart) * x;
+  // Avoid suppressing NaNs
+  simd_int16 eq = (x_init == x_init);
+  return simd_bitselect(x_init, (*(simd_float16*)&epart) * x, eq);
 }

 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
--- a/mlx/backend/common/CMakeLists.txt
+++ b/mlx/backend/common/CMakeLists.txt
@@ -1,5 +1,4 @@
-
-if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
  set(COMPILER ${CMAKE_C_COMPILER})
  set(CLANG TRUE)
 else()
@@ -7,72 +6,57 @@ else()
 endif()

 add_custom_command(
-    OUTPUT  compiled_preamble.cpp
-    COMMAND /bin/bash
-              ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
-              ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
-              ${COMPILER}
-              ${PROJECT_SOURCE_DIR}
-              ${CLANG}
+  OUTPUT compiled_preamble.cpp
+  COMMAND
+    /bin/bash ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
+    ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp ${COMPILER}
+    ${PROJECT_SOURCE_DIR} ${CLANG}
+  DEPENDS make_compiled_preamble.sh
+          compiled_preamble.h
+          ${PROJECT_SOURCE_DIR}/mlx/types/half_types.h
+          ${PROJECT_SOURCE_DIR}/mlx/types/fp16.h
+          ${PROJECT_SOURCE_DIR}/mlx/types/bf16.h
+          ${PROJECT_SOURCE_DIR}/mlx/types/complex.h
+          ops.h)

-    DEPENDS make_compiled_preamble.sh
-            compiled_preamble.h
-            ${PROJECT_SOURCE_DIR}/mlx/types/half_types.h
-            ${PROJECT_SOURCE_DIR}/mlx/types/fp16.h
-            ${PROJECT_SOURCE_DIR}/mlx/types/bf16.h
-            ${PROJECT_SOURCE_DIR}/mlx/types/complex.h
-            ops.h
-)
-
-add_custom_target(
-  cpu_compiled_preamble
-  DEPENDS compiled_preamble.cpp
-)
+add_custom_target(cpu_compiled_preamble DEPENDS compiled_preamble.cpp)

 add_dependencies(mlx cpu_compiled_preamble)

 target_sources(
  mlx
-  PRIVATE
-  ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/erf.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/reduce_utils.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp
-)
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/erf.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce_utils.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
+          ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp)

-if (IOS)
-  target_sources(
-    mlx
-    PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}/compiled_nocpu.cpp
-  )
+if(IOS)
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled_nocpu.cpp)
 else()
-  target_sources(
-    mlx
-    PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}/compiled_cpu.cpp
-  )
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled_cpu.cpp)
 endif()
--- a/mlx/backend/common/arg_reduce.cpp
+++ b/mlx/backend/common/arg_reduce.cpp
@@ -13,8 +13,8 @@ template <typename InT, typename OpT>
 void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto axis_size = in.shape()[axis];
  auto axis_stride = in.strides()[axis];
-  std::vector<size_t> strides = in.strides();
-  std::vector<int> shape = in.shape();
+  Strides strides = in.strides();
+  Shape shape = in.shape();
  strides.erase(strides.begin() + axis);
  shape.erase(shape.begin() + axis);
  for (uint32_t i = 0; i < out.size(); ++i) {
--- a/mlx/backend/common/binary.h
+++ b/mlx/backend/common/binary.h
@@ -122,19 +122,7 @@ void set_binary_op_output_data(
  }
 }

-struct UseDefaultBinaryOp {
-  template <typename T, typename U>
-  void operator()(const T* a, const T* b, U* dst, int size) {
-    // Should we throw? This should normally never be called.
-    assert(false);
-  }
-
-  template <typename T, typename U>
-  void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
-    // Should we throw? This should normally never be called.
-    assert(false);
-  }
-};
+struct UseDefaultBinaryOp {};

 template <typename T, typename U, typename Op>
 struct DefaultVectorScalar {
@@ -150,18 +138,6 @@ struct DefaultVectorScalar {
      a++;
    }
  }
-
-  void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
-    T scalar = *b;
-    while (size-- > 0) {
-      auto dst = op(*a, scalar);
-      *dst_a = dst.first;
-      *dst_b = dst.second;
-      dst_a++;
-      dst_b++;
-      a++;
-    }
-  }
 };

 template <typename T, typename U, typename Op>
@@ -178,18 +154,6 @@ struct DefaultScalarVector {
      b++;
    }
  }
-
-  void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
-    T scalar = *a;
-    while (size-- > 0) {
-      auto dst = op(scalar, *b);
-      *dst_a = dst.first;
-      *dst_b = dst.second;
-      dst_a++;
-      dst_b++;
-      b++;
-    }
-  }
 };

 template <typename T, typename U, typename Op>
@@ -206,204 +170,110 @@ struct DefaultVectorVector {
      b++;
    }
  }
-
-  void operator()(const T* a, const T* b, U* dst_a, U* dst_b, int size) {
-    while (size-- > 0) {
-      auto dst = op(*a, *b);
-      *dst_a = dst.first;
-      *dst_b = dst.second;
-      dst_a++;
-      dst_b++;
-      a++;
-      b++;
-    }
-  }
 };

-template <typename T, typename U, typename Op>
-void binary_op_dims1(const array& a, const array& b, array& out, Op op) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  for (size_t i = 0; i < out.size(); ++i) {
-    dst[i] = op(a_ptr[a_idx], b_ptr[b_idx]);
-    a_idx += a.strides()[0];
-    b_idx += b.strides()[0];
-  }
-}
-
-template <typename T, typename U, typename Op>
-void binary_op_dims1(
-    const array& a,
-    const array& b,
-    array& out,
+template <typename T, typename U, typename Op, int D, bool Strided>
+void binary_op_dims(
+    const T* a,
+    const T* b,
+    U* out,
    Op op,
-    int stride) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; i++) {
-    op(a_ptr + a_idx, b_ptr + b_idx, dst, stride);
-    a_idx += a.strides()[0];
-    b_idx += b.strides()[0];
-    dst += stride;
-  }
-}
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& out_strides,
+    int axis) {
+  auto stride_a = a_strides[axis];
+  auto stride_b = b_strides[axis];
+  auto stride_out = out_strides[axis];
+  auto N = shape[axis];

-template <typename T, typename U, typename Op>
-void binary_op_dims2(const array& a, const array& b, array& out, Op op) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t out_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      dst[out_idx++] = op(a_ptr[a_idx], b_ptr[b_idx]);
-      a_idx += a.strides()[1];
-      b_idx += b.strides()[1];
-    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
-  }
-}
-
-template <typename T, typename U, typename Op>
-void binary_op_dims2(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op,
-    int stride) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      op(a_ptr + a_idx, b_ptr + b_idx, dst, stride);
-      a_idx += a.strides()[1];
-      b_idx += b.strides()[1];
-      dst += stride;
-    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
-  }
-}
-
-template <typename T, typename U, typename Op>
-void binary_op_dims3(const array& a, const array& b, array& out, Op op) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t out_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      for (size_t k = 0; k < a.shape()[2]; ++k) {
-        dst[out_idx++] = op(a_ptr[a_idx], b_ptr[b_idx]);
-        a_idx += a.strides()[2];
-        b_idx += b.strides()[2];
+  for (int i = 0; i < N; i++) {
+    if constexpr (D > 1) {
+      binary_op_dims<T, U, Op, D - 1, Strided>(
+          a, b, out, op, shape, a_strides, b_strides, out_strides, axis + 1);
+    } else {
+      if constexpr (Strided) {
+        op(a, b, out, stride_out);
+      } else {
+        *out = op(*a, *b);
      }
-      a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
-      b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
+    out += stride_out;
+    a += stride_a;
+    b += stride_b;
  }
 }

-template <typename T, typename U, typename Op>
-void binary_op_dims4(const array& a, const array& b, array& out, Op op) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t out_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      for (size_t k = 0; k < a.shape()[2]; ++k) {
-        for (size_t ii = 0; ii < a.shape()[3]; ++ii) {
-          dst[out_idx++] = op(a_ptr[a_idx], b_ptr[b_idx]);
-          a_idx += a.strides()[3];
-          b_idx += b.strides()[3];
-        }
-        a_idx += a.strides()[2] - a.strides()[3] * a.shape()[3];
-        b_idx += b.strides()[2] - b.strides()[3] * b.shape()[3];
-      }
-      a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
-      b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
-    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
-  }
-}
-
-template <typename T, typename U, typename Op>
-void binary_op_dispatch_dims(
-    const array& a,
-    const array& b,
-    array& out,
-    Op op) {
-  switch (out.ndim()) {
-    case 1:
-      binary_op_dims1<T, U, Op>(a, b, out, op);
-      return;
-    case 2:
-      binary_op_dims2<T, U, Op>(a, b, out, op);
-      return;
-    case 3:
-      binary_op_dims3<T, U, Op>(a, b, out, op);
-      return;
-    case 4:
-      binary_op_dims4<T, U, Op>(a, b, out, op);
-      return;
-  }
-
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst = out.data<U>();
-  for (size_t i = 0; i < out.size(); i++) {
-    int a_idx = elem_to_loc(i, a.shape(), a.strides());
-    int b_idx = elem_to_loc(i, b.shape(), b.strides());
-    dst[i] = op(a_ptr[a_idx], b_ptr[b_idx]);
-  }
-}
-
-template <typename T, typename U, typename Op>
+template <typename T, typename U, bool Strided, typename Op>
 void binary_op_dispatch_dims(
    const array& a,
    const array& b,
    array& out,
    Op op,
    int dim,
-    int stride) {
-  // Number of dimensions to loop over for vectorized ops
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& out_strides) {
+  const T* a_ptr = a.data<T>();
+  const T* b_ptr = b.data<T>();
+  U* out_ptr = out.data<U>();
  switch (dim) {
    case 1:
-      binary_op_dims1<T, U, Op>(a, b, out, op, stride);
+      binary_op_dims<T, U, Op, 1, Strided>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          out_strides,
+          0);
      return;
    case 2:
-      binary_op_dims2<T, U, Op>(a, b, out, op, stride);
+      binary_op_dims<T, U, Op, 2, Strided>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          out_strides,
+          0);
+      return;
+    case 3:
+      binary_op_dims<T, U, Op, 3, Strided>(
+          a_ptr,
+          b_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          out_strides,
+          0);
      return;
  }

-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst = out.data<U>();
-  for (size_t i = 0; i < out.size(); i += stride) {
-    int a_idx = elem_to_loc(i, a.shape(), a.strides());
-    int b_idx = elem_to_loc(i, b.shape(), b.strides());
-    op(a_ptr + a_idx, b_ptr + b_idx, dst, stride);
-    dst += stride;
+  ContiguousIterator a_it(shape, a_strides, dim - 3);
+  ContiguousIterator b_it(shape, b_strides, dim - 3);
+  auto stride = out_strides[dim - 4];
+  for (int64_t elem = 0; elem < a.size(); elem += stride) {
+    binary_op_dims<T, U, Op, 3, Strided>(
+        a_ptr + a_it.loc,
+        b_ptr + b_it.loc,
+        out_ptr + elem,
+        op,
+        shape,
+        a_strides,
+        b_strides,
+        out_strides,
+        dim - 3);
+    a_it.step();
+    b_it.step();
  }
 }

@@ -450,29 +320,33 @@ void binary_op(
  }

  // General computation so let's try to optimize
+  auto [new_shape, new_strides] = collapse_contiguous_dims(
+      a.shape(), {a.strides(), b.strides(), out.strides()});
+  const auto& a_strides = new_strides[0];
+  const auto& b_strides = new_strides[1];
+  const auto& strides = new_strides[2];

  // Get the left-most dim such that the array is row contiguous after
-  auto& strides = out.strides();
-  auto leftmost_rc_dim = [&strides](const array& arr) {
-    int d = arr.ndim() - 1;
-    for (; d >= 0 && arr.strides()[d] == strides[d]; d--) {
+  auto leftmost_rc_dim = [&strides](const auto& arr_strides) {
+    int d = arr_strides.size() - 1;
+    for (; d >= 0 && arr_strides[d] == strides[d]; d--) {
    }
    return d + 1;
  };
-  auto a_rc_dim = leftmost_rc_dim(a);
-  auto b_rc_dim = leftmost_rc_dim(b);
+  auto a_rc_dim = leftmost_rc_dim(a_strides);
+  auto b_rc_dim = leftmost_rc_dim(b_strides);

  // Get the left-most dim such that the array is a broadcasted "scalar" after
-  auto leftmost_s_dim = [](const array& arr) {
-    int d = arr.ndim() - 1;
-    for (; d >= 0 && arr.strides()[d] == 0; d--) {
+  auto leftmost_s_dim = [](const auto& arr_strides) {
+    int d = arr_strides.size() - 1;
+    for (; d >= 0 && arr_strides[d] == 0; d--) {
    }
    return d + 1;
  };
-  auto a_s_dim = leftmost_s_dim(a);
-  auto b_s_dim = leftmost_s_dim(b);
+  auto a_s_dim = leftmost_s_dim(a_strides);
+  auto b_s_dim = leftmost_s_dim(b_strides);

-  auto ndim = out.ndim();
+  auto ndim = new_shape.size();

  // Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
  int dim = ndim;
@@ -494,27 +368,27 @@ void binary_op(
  // Can be sure dim > 0 since otherwise we would have used one of the fully
  // contiguous methods above. Except for the case that the flags do not
  // correspond to the underlying contiguity.
-  size_t stride;
  if (dim == 0 || strides[dim - 1] < 16) {
-    stride = 1;
    bopt = BinaryOpType::General;
    dim = ndim;
-  } else {
-    stride = strides[dim - 1];
  }

  switch (bopt) {
    case BinaryOpType::VectorVector:
-      binary_op_dispatch_dims<T, U>(a, b, out, opvv, dim, stride);
+      binary_op_dispatch_dims<T, U, true>(
+          a, b, out, opvv, dim, new_shape, a_strides, b_strides, strides);
      break;
    case BinaryOpType::VectorScalar:
-      binary_op_dispatch_dims<T, U>(a, b, out, opvs, dim, stride);
+      binary_op_dispatch_dims<T, U, true>(
+          a, b, out, opvs, dim, new_shape, a_strides, b_strides, strides);
      break;
    case BinaryOpType::ScalarVector:
-      binary_op_dispatch_dims<T, U>(a, b, out, opsv, dim, stride);
+      binary_op_dispatch_dims<T, U, true>(
+          a, b, out, opsv, dim, new_shape, a_strides, b_strides, strides);
      break;
    default:
-      binary_op_dispatch_dims<T, U>(a, b, out, op);
+      binary_op_dispatch_dims<T, U, false>(
+          a, b, out, op, dim, new_shape, a_strides, b_strides, strides);
      break;
  }
 }
@@ -531,9 +405,9 @@ void binary_op(
  // TODO: The following mess of constexpr evaluations can probably be achieved
  //       with template specializations and overloading. Would it be simpler?

-  if (std::is_same<decltype(opsv), UseDefaultBinaryOp>::value) {
-    if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
-      if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
+  if constexpr (std::is_same<decltype(opsv), UseDefaultBinaryOp>::value) {
+    if constexpr (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
+      if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
        // All ops are UseDefaultBinaryOp (why oh why would someone call that?)
        binary_op<T, T>(
            a,
@@ -554,7 +428,8 @@ void binary_op(
            DefaultVectorScalar<T, T, Op>(op),
            opvv);
      }
-    } else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
+    } else if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::
+                             value) {
      // opsv and opvv were UseDefaultBinaryOp
      binary_op<T, T>(
          a,
@@ -569,7 +444,8 @@ void binary_op(
      binary_op<T, T>(
          a, b, out, op, DefaultScalarVector<T, T, Op>(op), opvs, opvv);
    }
-  } else if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
+  } else if constexpr (std::is_same<decltype(opvs), UseDefaultBinaryOp>::
+                           value) {
    if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
      // opvs and opvv were UseDefaultBinaryOp
      binary_op<T, T>(
@@ -585,7 +461,8 @@ void binary_op(
      binary_op<T, T>(
          a, b, out, op, opsv, DefaultVectorScalar<T, T, Op>(op), opvv);
    }
-  } else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
+  } else if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::
+                           value) {
    // opvv was UseDefaultBinaryOp
    binary_op<T, T>(
        a, b, out, op, opsv, opvs, DefaultVectorVector<T, T, Op>(op));
--- a/mlx/backend/common/binary_two.h
+++ b/mlx/backend/common/binary_two.h
@@ -9,168 +9,43 @@ namespace mlx::core {

 namespace {

-template <typename T, typename U, typename Op>
-void binary_op_dims1(
-    const array& a,
-    const array& b,
-    array& out_a,
-    array& out_b,
-    Op op) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst_a = out_a.data<U>();
-  U* dst_b = out_b.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  for (size_t i = 0; i < out_a.size(); ++i) {
-    auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
-    dst_a[i] = dst.first;
-    dst_b[i] = dst.second;
-    a_idx += a.strides()[0];
-    b_idx += b.strides()[0];
-  }
-}
-
-template <typename T, typename U, typename Op>
-void binary_op_dims1(
-    const array& a,
-    const array& b,
-    array& out_a,
-    array& out_b,
+template <typename T, typename U, typename Op, int D>
+void binary_op_dims(
+    const T* a,
+    const T* b,
+    U* out_a,
+    U* out_b,
    Op op,
-    int stride) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst_a = out_a.data<U>();
-  U* dst_b = out_b.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; i++) {
-    op(a_ptr + a_idx, b_ptr + b_idx, dst_a, dst_b, stride);
-    a_idx += a.strides()[0];
-    b_idx += b.strides()[0];
-    dst_a += stride;
-    dst_b += stride;
-  }
-}
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& out_strides,
+    int axis) {
+  auto stride_a = a_strides[axis];
+  auto stride_b = b_strides[axis];
+  auto stride_out = out_strides[axis];
+  auto N = shape[axis];

-template <typename T, typename U, typename Op>
-void binary_op_dims2(
-    const array& a,
-    const array& b,
-    array& out_a,
-    array& out_b,
-    Op op) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst_a = out_a.data<U>();
-  U* dst_b = out_b.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t out_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
-      dst_a[out_idx] = dst.first;
-      dst_b[out_idx++] = dst.second;
-      a_idx += a.strides()[1];
-      b_idx += b.strides()[1];
+  for (int i = 0; i < N; i++) {
+    if constexpr (D > 1) {
+      binary_op_dims<T, U, Op, D - 1>(
+          a,
+          b,
+          out_a,
+          out_b,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          out_strides,
+          axis + 1);
+    } else {
+      std::tie(*out_a, *out_b) = op(*a, *b);
    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
-  }
-}
-
-template <typename T, typename U, typename Op>
-void binary_op_dims2(
-    const array& a,
-    const array& b,
-    array& out_a,
-    array& out_b,
-    Op op,
-    int stride) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst_a = out_a.data<U>();
-  U* dst_b = out_b.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      op(a_ptr + a_idx, b_ptr + b_idx, dst_a, dst_b, stride);
-      a_idx += a.strides()[1];
-      b_idx += b.strides()[1];
-      dst_a += stride;
-      dst_b += stride;
-    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
-  }
-}
-
-template <typename T, typename U, typename Op>
-void binary_op_dims3(
-    const array& a,
-    const array& b,
-    array& out_a,
-    array& out_b,
-    Op op) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst_a = out_a.data<U>();
-  U* dst_b = out_b.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t out_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      for (size_t k = 0; k < a.shape()[2]; ++k) {
-        auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
-        dst_a[out_idx] = dst.first;
-        dst_b[out_idx++] = dst.second;
-        a_idx += a.strides()[2];
-        b_idx += b.strides()[2];
-      }
-      a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
-      b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
-    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
-  }
-}
-
-template <typename T, typename U, typename Op>
-void binary_op_dims4(
-    const array& a,
-    const array& b,
-    array& out_a,
-    array& out_b,
-    Op op) {
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst_a = out_a.data<U>();
-  U* dst_b = out_b.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t out_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      for (size_t k = 0; k < a.shape()[2]; ++k) {
-        for (size_t ii = 0; ii < a.shape()[3]; ++ii) {
-          auto dst = op(a_ptr[a_idx], b_ptr[b_idx]);
-          dst_a[out_idx] = dst.first;
-          dst_b[out_idx++] = dst.second;
-          a_idx += a.strides()[3];
-          b_idx += b.strides()[3];
-        }
-        a_idx += a.strides()[2] - a.strides()[3] * a.shape()[3];
-        b_idx += b.strides()[2] - b.strides()[3] * b.shape()[3];
-      }
-      a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
-      b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
-    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
+    a += stride_a;
+    b += stride_b;
+    out_a += stride_out;
+    out_b += stride_out;
  }
 }

@@ -181,352 +56,160 @@ void binary_op_dispatch_dims(
    array& out_a,
    array& out_b,
    Op op) {
-  switch (out_a.ndim()) {
+  auto [shape, strides] = collapse_contiguous_dims(
+      a.shape(), {a.strides(), b.strides(), out_a.strides()});
+  const auto& a_strides = strides[0];
+  const auto& b_strides = strides[1];
+  const auto& out_strides = strides[2];
+  const T* a_ptr = a.data<T>();
+  const T* b_ptr = b.data<T>();
+  U* out_a_ptr = out_a.data<U>();
+  U* out_b_ptr = out_b.data<U>();
+
+  int ndim = shape.size();
+  switch (ndim) {
    case 1:
-      binary_op_dims1<T, U, Op>(a, b, out_a, out_b, op);
+      binary_op_dims<T, U, Op, 1>(
+          a_ptr,
+          b_ptr,
+          out_a_ptr,
+          out_b_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          out_strides,
+          0);
      return;
    case 2:
-      binary_op_dims2<T, U, Op>(a, b, out_a, out_b, op);
-      return;
-    case 3:
-      binary_op_dims3<T, U, Op>(a, b, out_a, out_b, op);
-      return;
-    case 4:
-      binary_op_dims4<T, U, Op>(a, b, out_a, out_b, op);
+      binary_op_dims<T, U, Op, 2>(
+          a_ptr,
+          b_ptr,
+          out_a_ptr,
+          out_b_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          out_strides,
+          0);
      return;
  }

-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst_a = out_a.data<U>();
-  U* dst_b = out_b.data<U>();
-  for (size_t i = 0; i < out_a.size(); i++) {
-    int a_idx = elem_to_loc(i, a.shape(), a.strides());
-    int b_idx = elem_to_loc(i, b.shape(), b.strides());
-    std::tie(dst_a[i], dst_b[i]) = op(a_ptr[a_idx], b_ptr[b_idx]);
+  ContiguousIterator a_it(shape, a_strides, ndim - 2); // tracks a's offset
+  ContiguousIterator b_it(shape, b_strides, ndim - 2); // tracks b's offset
+  auto stride = out_strides[ndim - 3]; // output step between 2-D slices
+  for (size_t elem = 0; elem < a.size(); elem += stride) {
+    binary_op_dims<T, U, Op, 2>( // D = 2: loops axes ndim - 2 and ndim - 1
+        a_ptr + a_it.loc,
+        b_ptr + b_it.loc,
+        out_a_ptr + elem,
+        out_b_ptr + elem,
+        op,
+        shape,
+        a_strides,
+        b_strides,
+        out_strides,
+        ndim - 2);
+    a_it.step(); // move a's offset on for the next slice
+    b_it.step(); // move b's offset on for the next slice
  }
 }

-template <typename T, typename U, typename Op>
-void binary_op_dispatch_dims(
-    const array& a,
-    const array& b,
-    array& out_a,
-    array& out_b,
-    Op op,
-    int dim,
-    int stride) {
-  // Number of dimensions to loop over for vectorized ops
-  switch (dim) {
-    case 1:
-      binary_op_dims1<T, U, Op>(a, b, out_a, out_b, op, stride);
-      return;
-    case 2:
-      binary_op_dims2<T, U, Op>(a, b, out_a, out_b, op, stride);
-      return;
-  }
-
-  const T* a_ptr = a.data<T>();
-  const T* b_ptr = b.data<T>();
-  U* dst_a = out_a.data<U>();
-  U* dst_b = out_b.data<U>();
-  for (size_t i = 0; i < out_a.size(); i += stride) {
-    int a_idx = elem_to_loc(i, a.shape(), a.strides());
-    int b_idx = elem_to_loc(i, b.shape(), b.strides());
-    op(a_ptr + a_idx, b_ptr + b_idx, dst_a, dst_b, stride);
-    dst_a += stride;
-    dst_b += stride;
-  }
-}
-
-template <
-    typename T,
-    typename U,
-    typename Op,
-    typename OpSV,
-    typename OpVS,
-    typename OpVV>
+template <typename T, typename U = T, typename Op>
 void binary_op(
    const array& a,
    const array& b,
-    array& out_a,
-    array& out_b,
-    Op op,
-    OpSV opsv,
-    OpVS opvs,
-    OpVV opvv) {
+    std::vector<array>& outputs,
+    Op op) {
  auto bopt = get_binary_op_type(a, b);
+  auto& out_a = outputs[0];
+  auto& out_b = outputs[1];
  set_binary_op_output_data(a, b, out_a, bopt);
  set_binary_op_output_data(a, b, out_b, bopt);

-  // The full computation is scalar scalar so call the base op once
+  // General inputs use the strided dispatch; contiguous cases follow below
+  if (bopt == BinaryOpType::General) {
+    binary_op_dispatch_dims<T, U, Op>(a, b, out_a, out_b, op);
+    return;
+  }
+
+  auto a_ptr = a.data<T>();
+  auto b_ptr = b.data<T>();
+  auto out_a_ptr = out_a.data<U>();
+  auto out_b_ptr = out_b.data<U>();
  if (bopt == BinaryOpType::ScalarScalar) {
-    std::tie(*(out_a.data<U>()), *(out_b.data<U>())) =
-        op(*a.data<T>(), *b.data<T>());
-    return;
-  }
-
-  // The full computation is scalar vector so delegate to the op
-  if (bopt == BinaryOpType::ScalarVector) {
-    opsv(
-        a.data<T>(),
-        b.data<T>(),
-        out_a.data<U>(),
-        out_b.data<U>(),
-        b.data_size());
-    return;
-  }
-
-  // The full computation is vector scalar so delegate to the op
-  if (bopt == BinaryOpType::VectorScalar) {
-    opvs(
-        a.data<T>(),
-        b.data<T>(),
-        out_a.data<U>(),
-        out_b.data<U>(),
-        a.data_size());
-    return;
-  }
-
-  // The full computation is vector vector so delegate to the op
-  if (bopt == BinaryOpType::VectorVector) {
-    opvv(
-        a.data<T>(),
-        b.data<T>(),
-        out_a.data<U>(),
-        out_b.data<U>(),
-        out_a.size());
-    return;
-  }
-
-  // General computation so let's try to optimize
-
-  // Get the left-most dim such that the array is row contiguous after
-  auto& strides = out_a.strides();
-  auto leftmost_rc_dim = [&strides](const array& arr) {
-    int d = arr.ndim() - 1;
-    for (; d >= 0 && arr.strides()[d] == strides[d]; d--) {
+    std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
+  } else if (bopt == BinaryOpType::ScalarVector) {
+    for (size_t i = 0; i < b.data_size(); ++i) { // b's buffer, not b.size()
+      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
+      out_a_ptr++;
+      out_b_ptr++;
+      b_ptr++;
    }
-    return d + 1;
-  };
-  auto a_rc_dim = leftmost_rc_dim(a);
-  auto b_rc_dim = leftmost_rc_dim(b);
-
-  // Get the left-most dim such that the array is a broadcasted "scalar" after
-  auto leftmost_s_dim = [](const array& arr) {
-    int d = arr.ndim() - 1;
-    for (; d >= 0 && arr.strides()[d] == 0; d--) {
+  } else if (bopt == BinaryOpType::VectorScalar) {
+    for (size_t i = 0; i < a.data_size(); ++i) { // a's buffer, not a.size()
+      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
+      out_a_ptr++;
+      out_b_ptr++;
+      a_ptr++;
+    }
+  } else { // VectorVector
+    for (size_t i = 0; i < a.size(); ++i) {
+      std::tie(*out_a_ptr, *out_b_ptr) = op(*a_ptr, *b_ptr);
+      out_a_ptr++;
+      out_b_ptr++;
+      a_ptr++;
+      b_ptr++;
    }
-    return d + 1;
-  };
-  auto a_s_dim = leftmost_s_dim(a);
-  auto b_s_dim = leftmost_s_dim(b);
-
-  auto ndim = out_a.ndim();
-
-  // Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
-  int dim = ndim;
-  if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
-    bopt = BinaryOpType::VectorVector;
-    dim = d;
-    // Case 2: LxM and Fx1 where L and F are broadcastable and M is row
-    // contiguous
-  } else if (int d = std::max(a_rc_dim, b_s_dim); d < ndim) {
-    bopt = BinaryOpType::VectorScalar;
-    dim = d;
-    // Case 3: Lx1 and FxM where L and F are broadcastable and M is row
-    // contiguous
-  } else if (int d = std::max(a_s_dim, b_rc_dim); d < ndim) {
-    bopt = BinaryOpType::ScalarVector;
-    dim = d;
-  }
-
-  // Can be sure dim > 0 since otherwise we would have used one of the fully
-  // contiguous methods above. Except for the case that the flags do not
-  // correspond to the underlying contiguity.
-  size_t stride;
-  if (dim == 0 || strides[dim - 1] < 16) {
-    stride = 1;
-    bopt = BinaryOpType::General;
-    dim = ndim;
-  } else {
-    stride = strides[dim - 1];
-  }
-
-  switch (bopt) {
-    case BinaryOpType::VectorVector:
-      binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, opvv, dim, stride);
-      break;
-    case BinaryOpType::VectorScalar:
-      binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, opvs, dim, stride);
-      break;
-    case BinaryOpType::ScalarVector:
-      binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, opsv, dim, stride);
-      break;
-    default:
-      binary_op_dispatch_dims<T, U>(a, b, out_a, out_b, op);
-      break;
  }
 }

-template <typename T, typename Op, typename OpSV, typename OpVS, typename OpVV>
-void binary_op(
-    const array& a,
-    const array& b,
-    std::vector<array>& outputs,
-    Op op,
-    OpSV opsv,
-    OpVS opvs,
-    OpVV opvv) {
-  // TODO: The following mess of constexpr evaluations can probably be achieved
-  //       with template specializations and overloading. Would it be simpler?
-
-  if (std::is_same<decltype(opsv), UseDefaultBinaryOp>::value) {
-    if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
-      if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
-        // All ops are UseDefaultBinaryOp (why oh why would someone call that?)
-        binary_op<T, T>(
-            a,
-            b,
-            outputs[0],
-            outputs[1],
-            op,
-            DefaultScalarVector<T, T, Op>(op),
-            DefaultVectorScalar<T, T, Op>(op),
-            DefaultVectorVector<T, T, Op>(op));
-      } else {
-        // opsv and opvs were UseDefaultBinaryOp
-        binary_op<T, T>(
-            a,
-            b,
-            outputs[0],
-            outputs[1],
-            op,
-            DefaultScalarVector<T, T, Op>(op),
-            DefaultVectorScalar<T, T, Op>(op),
-            opvv);
-      }
-    } else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
-      // opsv and opvv were UseDefaultBinaryOp
-      binary_op<T, T>(
-          a,
-          b,
-          outputs[0],
-          outputs[1],
-          op,
-          DefaultScalarVector<T, T, Op>(op),
-          opvs,
-          DefaultVectorVector<T, T, Op>(op));
-    } else {
-      // opsv was UseDefaultBinaryOp
-      binary_op<T, T>(
-          a,
-          b,
-          outputs[0],
-          outputs[1],
-          op,
-          DefaultScalarVector<T, T, Op>(op),
-          opvs,
-          opvv);
-    }
-  } else if (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
-    if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
-      // opvs and opvv were UseDefaultBinaryOp
-      binary_op<T, T>(
-          a,
-          b,
-          outputs[0],
-          outputs[1],
-          op,
-          opsv,
-          DefaultVectorScalar<T, T, Op>(op),
-          DefaultVectorVector<T, T, Op>(op));
-    } else {
-      // opvs was UseDefaultBinaryOp
-      binary_op<T, T>(
-          a,
-          b,
-          outputs[0],
-          outputs[1],
-          op,
-          opsv,
-          DefaultVectorScalar<T, T, Op>(op),
-          opvv);
-    }
-  } else if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
-    // opvv was UseDefaultBinaryOp
-    binary_op<T, T>(
-        a,
-        b,
-        outputs[0],
-        outputs[1],
-        op,
-        opsv,
-        opvs,
-        DefaultVectorVector<T, T, Op>(op));
-  } else {
-    // All ops provided
-    binary_op<T, T>(a, b, outputs[0], outputs[1], op, opsv, opvs, opvv);
-  }
-}
-
-template <typename T, typename Op>
-void binary_op(
-    const array& a,
-    const array& b,
-    std::vector<array>& outputs,
-    Op op) {
-  DefaultScalarVector<T, T, Op> opsv(op);
-  DefaultVectorScalar<T, T, Op> opvs(op);
-  DefaultVectorVector<T, T, Op> opvv(op);
-  binary_op<T, T>(a, b, outputs[0], outputs[1], op, opsv, opvs, opvv);
-}
-
-template <typename... Ops>
+template <typename Op>
 void binary(
    const array& a,
    const array& b,
    std::vector<array>& outputs,
-    Ops... ops) {
+    Op op) {
  switch (outputs[0].dtype()) {
    case bool_:
-      binary_op<bool>(a, b, outputs, ops...);
+      binary_op<bool>(a, b, outputs, op);
      break;
    case uint8:
-      binary_op<uint8_t>(a, b, outputs, ops...);
+      binary_op<uint8_t>(a, b, outputs, op);
      break;
    case uint16:
-      binary_op<uint16_t>(a, b, outputs, ops...);
+      binary_op<uint16_t>(a, b, outputs, op);
      break;
    case uint32:
-      binary_op<uint32_t>(a, b, outputs, ops...);
+      binary_op<uint32_t>(a, b, outputs, op);
      break;
    case uint64:
-      binary_op<uint64_t>(a, b, outputs, ops...);
+      binary_op<uint64_t>(a, b, outputs, op);
      break;
    case int8:
-      binary_op<int8_t>(a, b, outputs, ops...);
+      binary_op<int8_t>(a, b, outputs, op);
      break;
    case int16:
-      binary_op<int16_t>(a, b, outputs, ops...);
+      binary_op<int16_t>(a, b, outputs, op);
      break;
    case int32:
-      binary_op<int32_t>(a, b, outputs, ops...);
+      binary_op<int32_t>(a, b, outputs, op);
      break;
    case int64:
-      binary_op<int64_t>(a, b, outputs, ops...);
+      binary_op<int64_t>(a, b, outputs, op);
      break;
    case float16:
-      binary_op<float16_t>(a, b, outputs, ops...);
+      binary_op<float16_t>(a, b, outputs, op);
      break;
    case float32:
-      binary_op<float>(a, b, outputs, ops...);
+      binary_op<float>(a, b, outputs, op);
      break;
    case bfloat16:
-      binary_op<bfloat16_t>(a, b, outputs, ops...);
+      binary_op<bfloat16_t>(a, b, outputs, op);
      break;
    case complex64:
-      binary_op<complex64_t>(a, b, outputs, ops...);
+      binary_op<complex64_t>(a, b, outputs, op);
      break;
  }
 }
--- a/mlx/backend/common/cholesky.cpp
+++ b/mlx/backend/common/cholesky.cpp
@@ -2,46 +2,12 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/linalg.h"
 #include "mlx/primitives.h"

-#ifdef ACCELERATE_NEW_LAPACK
-#include <Accelerate/Accelerate.h>
-#else
-#include <lapack.h>
-#endif
-
 namespace mlx::core {

-namespace {
-
-// Delegate to the Cholesky factorization taking into account differences in
-// LAPACK implementations (basically how to pass the 'uplo' string to fortran).
-int spotrf_wrapper(char uplo, float* matrix, int N) {
-  int info;
-
-#ifdef LAPACK_FORTRAN_STRLEN_END
-  spotrf_(
-      /* uplo = */ &uplo,
-      /* n = */ &N,
-      /* a = */ matrix,
-      /* lda = */ &N,
-      /* info = */ &info,
-      /* uplo_len = */ static_cast<size_t>(1));
-#else
-  spotrf_(
-      /* uplo = */ &uplo,
-      /* n = */ &N,
-      /* a = */ matrix,
-      /* lda = */ &N,
-      /* info = */ &info);
-#endif
-
-  return info;
-}
-
-} // namespace
-
 void cholesky_impl(const array& a, array& factor, bool upper) {
  // Lapack uses the column-major convention. We take advantage of the fact that
  // the matrix should be symmetric:
@@ -66,7 +32,14 @@ void cholesky_impl(const array& a, array& factor, bool upper) {

  for (int i = 0; i < num_matrices; i++) {
    // Compute Cholesky factorization.
-    int info = spotrf_wrapper(uplo, matrix, N);
+    int info;
+    MLX_LAPACK_FUNC(spotrf)
+    (
+        /* uplo = */ &uplo,
+        /* n = */ &N,
+        /* a = */ matrix,
+        /* lda = */ &N,
+        /* info = */ &info);

    // TODO: We do nothing when the matrix is not positive semi-definite
    // because throwing an error would result in a crash. If we figure out how
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -39,7 +39,7 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {
  // rely on data_size anyway.
  size_t data_size = out.size();

-  return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
+  return move_or_copy(in, out, strides_, flags, data_size, offset_);
 }

 void Broadcast::eval(const std::vector<array>& inputs, array& out) {
@@ -49,7 +49,7 @@ void Broadcast::eval(const std::vector<array>& inputs, array& out) {
    out.set_data(nullptr);
    return;
  }
-  std::vector<size_t> strides(out.ndim(), 0);
+  Strides strides(out.ndim(), 0);
  int diff = out.ndim() - in.ndim();
  for (int i = in.ndim() - 1; i >= 0; --i) {
    strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
@@ -58,12 +58,12 @@ void Broadcast::eval(const std::vector<array>& inputs, array& out) {
  if (out.size() > in.size()) {
    flags.row_contiguous = flags.col_contiguous = false;
  }
-  out.copy_shared_buffer(in, strides, flags, in.data_size());
+  move_or_copy(in, out, strides, flags, in.data_size());
 }

 void Copy::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
-  out.copy_shared_buffer(inputs[0]);
+  move_or_copy(inputs[0], out);
 }

 void CustomTransforms::eval(
@@ -72,7 +72,7 @@ void CustomTransforms::eval(
  assert(inputs.size() > outputs.size());
  for (int i = 0, j = inputs.size() - outputs.size(); i < outputs.size();
       i++, j++) {
-    outputs[i].copy_shared_buffer(inputs[j]);
+    move_or_copy(inputs[j], outputs[i]);
  }
 }

@@ -81,7 +81,7 @@ void Depends::eval(
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
  for (int i = 0; i < outputs.size(); i++) {
-    outputs[i].copy_shared_buffer(inputs[i]);
+    move_or_copy(inputs[i], outputs[i]);
  }
 }

@@ -141,7 +141,7 @@ void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
  }
 }

-std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
+std::pair<bool, Strides> Reshape::prepare_reshape(
    const array& in,
    const array& out) {
  // Special case for empty arrays or row contiguous arrays
@@ -151,17 +151,15 @@ std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(

  // Special case for scalars
  if (in.ndim() == 0) {
-    std::vector<size_t> out_strides(out.ndim(), 0);
-    return {false, out_strides};
+    return {false, Strides(out.ndim(), 0)};
  }

  // Firstly let's collapse all the contiguous dimensions of the input
-  auto [shape, _strides] = collapse_contiguous_dims(in);
-  auto& strides = _strides[0];
+  auto [shape, strides] = collapse_contiguous_dims(in);

  // If shapes fit exactly in the contiguous dims then no copy is necessary so
  // let's check.
-  std::vector<size_t> out_strides;
+  Strides out_strides;
  bool copy_necessary = false;
  int j = 0;
  for (int i = 0; i < out.ndim(); i++) {
@@ -184,7 +182,7 @@ std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(

 void Reshape::shared_buffer_reshape(
    const array& in,
-    const std::vector<size_t>& out_strides,
+    const Strides& out_strides,
    array& out) {
  auto flags = in.flags();
  if (flags.row_contiguous) {
@@ -195,7 +193,7 @@ void Reshape::shared_buffer_reshape(
    auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
    flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
  }
-  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
+  move_or_copy(in, out, out_strides, flags, in.data_size());
 }

 void Split::eval(
@@ -250,26 +248,14 @@ void Split::eval(
  }
 }

-std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
-    const array& in) {
-  int64_t data_offset = 0;
-  std::vector<int64_t> inp_strides(in.ndim(), 0);
-  for (int i = 0; i < in.ndim(); ++i) {
-    data_offset += start_indices_[i] * in.strides()[i];
-    inp_strides[i] = in.strides()[i] * strides_[i];
-  }
-
-  return std::make_tuple(data_offset, inp_strides);
-}
-
 void StopGradient::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
-  out.copy_shared_buffer(inputs[0]);
+  move_or_copy(inputs[0], out);
 }

 void Transpose::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
-  std::vector<size_t> out_strides(out.ndim());
+  Strides out_strides(out.ndim());
  auto& in = inputs[0];
  for (int ax = 0; ax < axes_.size(); ++ax) {
    out_strides[ax] = in.strides()[axes_[ax]];
@@ -286,8 +272,8 @@ void Transpose::eval(const std::vector<array>& inputs, array& out) {
  //   true, they stay true)
  auto flags = in.flags();
  if (flags.contiguous && in.data_size() == in.size()) {
-    size_t f_stride = 1;
-    size_t b_stride = 1;
+    int64_t f_stride = 1;
+    int64_t b_stride = 1;
    flags.col_contiguous = true;
    flags.row_contiguous = true;
    for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
@@ -298,7 +284,7 @@ void Transpose::eval(const std::vector<array>& inputs, array& out) {
      b_stride *= out.shape(ri);
    }
  }
-  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
+  move_or_copy(in, out, out_strides, flags, in.data_size());
 }

 } // namespace mlx::core
--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -165,7 +165,7 @@ void compiled_allocate_outputs(
    bool move_buffers /* = false */) {
  if (contiguous) {
    int o = 0;
-    std::vector<size_t> strides;
+    Strides strides;
    size_t data_size;
    array::Flags flags;
    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
--- a/mlx/backend/common/compiled_cpu.cpp
+++ b/mlx/backend/common/compiled_cpu.cpp
@@ -4,6 +4,8 @@
 #include <filesystem>
 #include <fstream>
 #include <list>
+#include <mutex>
+#include <shared_mutex>

 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/common/compiled_preamble.h"
@@ -12,22 +14,7 @@

 namespace mlx::core {

-// GPU compile is always available if the GPU is available and since we are in
-// this file CPU compile is also available.
-namespace detail {
-bool compile_available_for_device(const Device& device) {
-  return true;
-}
-} // namespace detail
-
-std::string get_temp_file(const std::string& name) {
-  return std::filesystem::temp_directory_path().append(name);
-}
-
-// Return a pointer to a compiled function
-void* compile(
-    const std::string& kernel_name,
-    const std::string& source_code = "") {
+struct CompilerCache {
  struct DLib {
    DLib(const std::string& libname) {
      lib = dlopen(libname.c_str(), RTLD_NOW);
@@ -44,15 +31,41 @@ void* compile(
    void* lib;
  };
-  // Statics to cache compiled libraries and functions
+  // Cached compiled libraries and functions; compile() locks mtx to use them
-  static std::list<DLib> libs;
-  static std::unordered_map<std::string, void*> kernels;
-  if (auto it = kernels.find(kernel_name); it != kernels.end()) {
-    return it->second;
-  }
-  if (source_code.empty()) {
-    return nullptr;
+  std::list<DLib> libs;
+  std::unordered_map<std::string, void*> kernels;
+  std::shared_mutex mtx;
+};
+
+static CompilerCache cache{};
+
+// GPU compile is always available if the GPU is available and since we are in
+// this file CPU compile is also available.
+namespace detail {
+bool compile_available_for_device(const Device& device) {
+  return true;
+}
+} // namespace detail
+
+std::string get_temp_file(const std::string& name) {
+  return std::filesystem::temp_directory_path().append(name).string();
+}
+
+// Return the cached function for kernel_name, compiling it on a cache miss
+void* compile(
+    const std::string& kernel_name,
+    const std::function<std::string(void)>& source_builder) {
+  { // lookup under a shared lock, released at the end of this scope
+    std::shared_lock lock(cache.mtx);
+    if (auto it = cache.kernels.find(kernel_name); it != cache.kernels.end()) {
+      return it->second;
+    }
  }

+  std::unique_lock lock(cache.mtx); // exclusive; look up again before building
+  if (auto it = cache.kernels.find(kernel_name); it != cache.kernels.end()) {
+    return it->second;
+  }
+  std::string source_code = source_builder(); // only called on a cache miss
  std::string kernel_file_name;

  // Deal with long kernel names. Maximum length for files on macOS is 255
@@ -90,8 +103,8 @@ void* compile(
    source_file.close();

    std::ostringstream build_command;
-    build_command << "g++ -std=c++17 -O2 -Wall -fPIC -shared "
-                  << source_file_path << " -o " << shared_lib_path;
+    build_command << "g++ -std=c++17 -O3 -Wall -fPIC -shared '"
+                  << source_file_path << "' -o '" << shared_lib_path << "'";
    std::string build_command_str = build_command.str();
    auto return_code = system(build_command_str.c_str());
    if (return_code) {
@@ -103,10 +116,10 @@ void* compile(
  }

  // load library
-  libs.emplace_back(shared_lib_path);
+  cache.libs.emplace_back(shared_lib_path);

  // Load function
-  void* fun = dlsym(libs.back().lib, kernel_name.c_str());
+  void* fun = dlsym(cache.libs.back().lib, kernel_name.c_str());
  if (!fun) {
    std::ostringstream msg;
    msg << "[Compile::eval_cpu] Failed to load compiled function "
@@ -114,7 +127,7 @@ void* compile(
        << dlerror();
    throw std::runtime_error(msg.str());
  }
-  kernels.insert({kernel_name, fun});
+  cache.kernels.insert({kernel_name, fun});
  return fun;
 }

@@ -266,7 +279,7 @@ void Compiled::eval_cpu(

  // Figure out which kernel we are using
  auto& shape = outputs[0].shape();
-  bool contiguous = compiled_check_contiguity(inputs, shape);
+  auto contiguous = compiled_check_contiguity(inputs, shape);

  // Handle all broadcasting and collect function input arguments
  std::vector<void*> args;
@@ -316,10 +329,7 @@ void Compiled::eval_cpu(
  }

  // Get the function
-  auto fn_ptr = compile(kernel_name);
-
-  // If it doesn't exist, compile it
-  if (fn_ptr == nullptr) {
+  auto fn_ptr = compile(kernel_name, [&]() {
    std::ostringstream kernel;
    kernel << get_kernel_preamble() << std::endl;
    kernel << "extern \"C\"  {" << std::endl;
@@ -334,10 +344,8 @@ void Compiled::eval_cpu(
        ndim);
    // Close extern "C"
    kernel << "}" << std::endl;
-
-    // Compile and get function pointer
-    fn_ptr = compile(kernel_name, kernel.str());
-  }
+    return kernel.str();
+  });

  compiled_allocate_outputs(
      inputs, outputs, inputs_, constant_ids_, contiguous, false);
--- a/mlx/backend/common/conv.cpp
+++ b/mlx/backend/common/conv.cpp
@@ -3,13 +3,8 @@
 #include <cassert>
 #include <numeric>

-#ifdef ACCELERATE_NEW_LAPACK
-#include <Accelerate/Accelerate.h>
-#else
-#include <cblas.h>
-#endif
-
 #include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/primitives.h"
 #include "mlx/utils.h"

@@ -684,6 +679,32 @@ void dispatch_slow_conv_3D(
 // Explicit gemm conv
 ///////////////////////////////////////////////////////////////////////////////

+// Reverses, in place, the order of the spatial positions of a weight array.
+// The buffer is addressed as [out_channels][spatial_size][in_channels] packed
+// back to back, where out_channels = wt.shape(0), in_channels = wt.shape(-1)
+// and spatial_size is the product of the dimensions in between. For every
+// output channel, the i-th and (spatial_size - 1 - i)-th in_channels-long
+// vectors are exchanged.
+// NOTE(review): indexing ignores wt.strides(), so this assumes a densely
+// packed, row-major buffer of element type T -- confirm at call sites.
+template <typename T>
+void flip_spatial_dims_inplace(array& wt) {
+  T* x = wt.data<T>();
+  const size_t out_channels = wt.shape(0);
+  const size_t in_channels = wt.shape(-1);
+
+  // Total number of spatial positions. Kept in size_t so that the product and
+  // the offset arithmetic below are not evaluated (or compared) as int.
+  size_t spatial_size = 1;
+  for (int d = 1; d < wt.ndim() - 1; ++d) {
+    spatial_size *= wt.shape(d);
+  }
+
+  // With fewer than two spatial positions there is nothing to exchange; bail
+  // out before forming an end pointer from `spatial_size - 1`.
+  if (spatial_size < 2) {
+    return;
+  }
+
+  // Number of elements belonging to one output channel.
+  const size_t block = spatial_size * in_channels;
+  for (size_t i = 0; i < out_channels; i++) {
+    // Walk inwards from both ends of this output channel's block.
+    T* top = x + i * block;
+    T* bottom = top + block - in_channels;
+    for (size_t j = 0; j < spatial_size / 2; j++) {
+      for (size_t k = 0; k < in_channels; k++) {
+        std::swap(top[k], bottom[k]);
+      }
+      top += in_channels;
+      bottom -= in_channels;
+    }
+  }
+}
+
 void explicit_gemm_conv_1D_cpu(
    const array& in,
    const array& wt,
@@ -725,9 +746,9 @@ void explicit_gemm_conv_1D_cpu(
  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);

  // Make strided view
-  std::vector<int> strided_shape = {N, oH, wH, C};
+  Shape strided_shape = {N, oH, wH, C};

-  std::vector<size_t> strided_strides = {
+  Strides strided_strides = {
      in_padded.strides()[0],
      in_padded.strides()[1] * wt_strides[0],
      in_padded.strides()[1],
@@ -844,9 +865,9 @@ void explicit_gemm_conv_2D_cpu(
  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);

  // Make strided view
-  std::vector<int> strided_shape = {N, oH, oW, wH, wW, C};
+  Shape strided_shape = {N, oH, oW, wH, wW, C};

-  std::vector<size_t> strided_strides = {
+  Strides strided_strides = {
      in_padded.strides()[0],
      in_padded.strides()[1] * wt_strides[0],
      in_padded.strides()[2] * wt_strides[1],
@@ -910,7 +931,8 @@ void explicit_gemm_conv_ND_cpu(
    array out,
    const std::vector<int>& padding,
    const std::vector<int>& wt_strides,
-    const std::vector<int>& wt_dilation) {
+    const std::vector<int>& wt_dilation,
+    const bool flip) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const auto iDim = std::vector<int>(
      in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
@@ -952,7 +974,7 @@ void explicit_gemm_conv_ND_cpu(
  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);

  // Make strided view
-  std::vector<int> strided_shape(oDim.size() + wDim.size() + 2);
+  Shape strided_shape(oDim.size() + wDim.size() + 2);
  strided_shape.front() = N;
  for (size_t i = 0; i < oDim.size(); i++) {
    strided_shape[i + 1] = oDim[i];
@@ -962,7 +984,7 @@ void explicit_gemm_conv_ND_cpu(
  }
  strided_shape.back() = C;

-  std::vector<size_t> strided_strides(in.shape().size() * 2 - 2);
+  Strides strided_strides(in.shape().size() * 2 - 2);
  strided_strides[0] = in_padded.strides()[0];
  for (size_t i = 0; i < wt_strides.size(); i++) {
    strided_strides[i + 1] = in_padded.strides()[i + 1] * wt_strides[i];
@@ -978,7 +1000,7 @@ void explicit_gemm_conv_ND_cpu(
      in_padded, strided_strides, flags, in_strided_view.size(), 0);

  // Materialize strided view
-  std::vector<int> strided_reshape = {N, C};
+  Shape strided_reshape = {N, C};
  for (const auto& o : oDim) {
    strided_reshape[0] *= o;
  }
@@ -1000,6 +1022,14 @@ void explicit_gemm_conv_ND_cpu(
    copy(wt, gemm_wt, ctype);
  }

+  if (flip) {
+    auto gemm_wt_ = array(gemm_wt.shape(), float32, nullptr, {});
+    copy(gemm_wt, gemm_wt_, CopyType::Vector);
+
+    flip_spatial_dims_inplace<float>(gemm_wt_);
+    gemm_wt = gemm_wt_;
+  }
+
  if (out.dtype() != float32) {
    gemm_out = array(out.shape(), float32, nullptr, {});
    gemm_out.set_data(allocator::malloc_or_wait(gemm_out.nbytes()));
@@ -1042,10 +1072,15 @@ void conv_1D_cpu(
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip) {
+  const int groups = in.shape().back() / wt.shape().back();
  if (wt_dilation[0] == 1 && in_dilation[0] == 1 && !flip) {
    return explicit_gemm_conv_1D_cpu(
        in, wt, out, padding, wt_strides, wt_dilation);
  }
+  if (wt_dilation[0] == 1 && in_dilation[0] == 1 && groups == 1) {
+    return explicit_gemm_conv_ND_cpu(
+        in, wt, out, padding, wt_strides, wt_dilation, flip);
+  }

  return dispatch_slow_conv_1D(
      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
@@ -1060,6 +1095,13 @@ void conv_2D_cpu(
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip) {
+  const int groups = in.shape().back() / wt.shape().back();
+  if (wt_dilation[0] == 1 && wt_dilation[1] == 1 && in_dilation[0] == 1 &&
+      in_dilation[1] == 1 && groups == 1) {
+    return explicit_gemm_conv_ND_cpu(
+        in, wt, out, padding, wt_strides, wt_dilation, flip);
+  }
+
  return dispatch_slow_conv_2D(
      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
 }
@@ -1073,6 +1115,14 @@ void conv_3D_cpu(
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip) {
+  const int groups = in.shape().back() / wt.shape().back();
+  if (wt_dilation[0] == 1 && wt_dilation[1] == 1 && wt_dilation[2] == 1 &&
+      in_dilation[0] == 1 && in_dilation[1] == 1 && in_dilation[2] == 1 &&
+      groups == 1) {
+    return explicit_gemm_conv_ND_cpu(
+        in, wt, out, padding, wt_strides, wt_dilation, flip);
+  }
+
  return dispatch_slow_conv_3D(
      in, wt, out, padding, wt_strides, wt_dilation, in_dilation, flip);
 }
--- a/mlx/backend/common/copy.cpp
+++ b/mlx/backend/common/copy.cpp
@@ -26,292 +26,117 @@ void copy_vector(const array& src, array& dst) {
  std::copy(src_ptr, src_ptr + src.data_size(), dst_ptr);
 }

-template <typename SrcT, typename DstT, typename stride_t>
-void copy_general_dim1(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    int64_t i_offset) {
-  const SrcT* src_ptr = src.data<SrcT>();
-  DstT* dst_ptr = dst.data<DstT>();
-  stride_t src_idx = i_offset;
-  stride_t dst_idx = 0;
-  for (int i = 0; i < data_shape[0]; ++i) {
-    dst_ptr[dst_idx++] = static_cast<DstT>(src_ptr[src_idx]);
-    src_idx += i_strides[0];
+// Recursively copies a D-dimensional strided region from `src` to `dst`,
+// converting each element with static_cast<DstT>.
+//
+// `axis` is the dimension handled by this call; the remaining D - 1
+// dimensions are handled by the recursive instantiation with `axis + 1`.
+// `src` and `dst` must already point at the first element of the region
+// (any offsets are applied by the caller). `i_strides` / `o_strides` are the
+// per-dimension element strides used to advance `src` / `dst`.
+template <typename SrcT, typename DstT, int D>
+inline void copy_dims(
+    const SrcT* src,
+    DstT* dst,
+    const Shape& shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
+    int axis) {
+  auto stride_src = i_strides[axis];
+  auto stride_dst = o_strides[axis];
+  auto N = shape[axis];
+
+  for (int i = 0; i < N; i++) {
+    // D is a compile-time constant, so only one branch is instantiated:
+    // recurse into the next axis, or copy a single element at the innermost.
+    if constexpr (D > 1) {
+      copy_dims<SrcT, DstT, D - 1>(
+          src, dst, shape, i_strides, o_strides, axis + 1);
+    } else {
+      *dst = static_cast<DstT>(*src);
+    }
+    // Advance both pointers by this axis' stride for the next index.
+    src += stride_src;
+    dst += stride_dst;
  }
 }

 template <typename SrcT, typename DstT>
-inline void copy_general_dim1(const array& src, array& dst) {
-  return copy_general_dim1<SrcT, DstT, size_t>(
-      src, dst, src.shape(), src.strides(), 0);
-}
-
-template <typename SrcT, typename DstT, typename stride_t>
-void copy_general_dim2(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    int64_t i_offset) {
-  const SrcT* src_ptr = src.data<SrcT>();
-  DstT* dst_ptr = dst.data<DstT>();
-  stride_t src_idx = i_offset;
-  stride_t dst_idx = 0;
-  for (int i = 0; i < data_shape[0]; ++i) {
-    for (int j = 0; j < data_shape[1]; ++j) {
-      dst_ptr[dst_idx++] = static_cast<DstT>(src_ptr[src_idx]);
-      src_idx += i_strides[1];
-    }
-    src_idx += i_strides[0] - i_strides[1] * data_shape[1];
-  }
-}
-
-template <typename SrcT, typename DstT>
-inline void copy_general_dim2(const array& src, array& dst) {
-  return copy_general_dim2<SrcT, DstT, size_t>(
-      src, dst, src.shape(), src.strides(), 0);
-}
-
-template <typename SrcT, typename DstT, typename stride_t>
-void copy_general_dim3(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    int64_t i_offset) {
-  const SrcT* src_ptr = src.data<SrcT>();
-  DstT* dst_ptr = dst.data<DstT>();
-  stride_t src_idx = i_offset;
-  stride_t dst_idx = 0;
-  for (int i = 0; i < data_shape[0]; ++i) {
-    for (int j = 0; j < data_shape[1]; ++j) {
-      for (int k = 0; k < data_shape[2]; ++k) {
-        dst_ptr[dst_idx++] = static_cast<DstT>(src_ptr[src_idx]);
-        src_idx += i_strides[2];
-      }
-      src_idx += i_strides[1] - i_strides[2] * data_shape[2];
-    }
-    src_idx += i_strides[0] - i_strides[1] * data_shape[1];
-  }
-}
-
-template <typename SrcT, typename DstT>
-inline void copy_general_dim3(const array& src, array& dst) {
-  return copy_general_dim3<SrcT, DstT, size_t>(
-      src, dst, src.shape(), src.strides(), 0);
-}
-
-template <typename SrcT, typename DstT, typename stride_t>
-void copy_general_dim4(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    int64_t i_offset) {
-  const SrcT* src_ptr = src.data<SrcT>();
-  DstT* dst_ptr = dst.data<DstT>();
-  stride_t src_idx = i_offset;
-  stride_t dst_idx = 0;
-  for (int i = 0; i < data_shape[0]; ++i) {
-    for (int j = 0; j < data_shape[1]; ++j) {
-      for (int k = 0; k < data_shape[2]; ++k) {
-        for (int ii = 0; ii < data_shape[3]; ++ii) {
-          dst_ptr[dst_idx++] = static_cast<DstT>(src_ptr[src_idx]);
-          src_idx += i_strides[3];
-        }
-        src_idx += i_strides[2] - i_strides[3] * data_shape[3];
-      }
-      src_idx += i_strides[1] - i_strides[2] * data_shape[2];
-    }
-    src_idx += i_strides[0] - i_strides[1] * data_shape[1];
-  }
-}
-
-template <typename SrcT, typename DstT>
-inline void copy_general_dim4(const array& src, array& dst) {
-  return copy_general_dim4<SrcT, DstT, size_t>(
-      src, dst, src.shape(), src.strides(), 0);
-}
-
-template <typename SrcT, typename DstT, typename stride_t>
-void copy_general(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    int64_t i_offset) {
-  auto [new_shape, new_strides] = collapse_contiguous_dims(
-      data_shape, std::vector<std::vector<stride_t>>{i_strides});
-  switch (new_shape.size()) {
-    case 1:
-      copy_general_dim1<SrcT, DstT, stride_t>(
-          src, dst, new_shape, new_strides[0], i_offset);
-      return;
-    case 2:
-      copy_general_dim2<SrcT, DstT, stride_t>(
-          src, dst, new_shape, new_strides[0], i_offset);
-      return;
-    case 3:
-      copy_general_dim3<SrcT, DstT, stride_t>(
-          src, dst, new_shape, new_strides[0], i_offset);
-      return;
-    case 4:
-      copy_general_dim4<SrcT, DstT, stride_t>(
-          src, dst, new_shape, new_strides[0], i_offset);
-      return;
-  }
-
-  auto src_ptr = src.data<SrcT>() + i_offset;
-  auto dst_ptr = dst.data<DstT>();
-  for (size_t i = 0; i < dst.size(); ++i) {
-    stride_t src_elem = elem_to_loc(i, new_shape, new_strides[0]);
-    dst_ptr[i] = static_cast<DstT>(src_ptr[src_elem]);
-  }
-}
-
-template <typename SrcT, typename DstT>
-inline void copy_general(const array& src, array& dst) {
-  return copy_general<SrcT, DstT, size_t>(
-      src, dst, src.shape(), src.strides(), 0);
-}
-
-template <typename SrcT, typename DstT, typename stride_t>
-inline void copy_general(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    const std::vector<stride_t>& o_strides,
-    int64_t i_offset,
-    int64_t o_offset) {
-  return copy_general<SrcT, DstT, stride_t>(
-      src, dst, data_shape, i_strides, i_offset);
-}
-
-template <typename SrcT, typename DstT, typename stride_t, int D>
-inline void copy_general_general_dims(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    const std::vector<stride_t>& o_strides,
-    int64_t i_offset,
-    int64_t o_offset) {
-  if constexpr (D > 1) {
-    int axis = data_shape.size() - D;
-    auto stride_src = i_strides[axis];
-    auto stride_dst = o_strides[axis];
-    auto N = data_shape[axis];
-    for (int i = 0; i < N; i++) {
-      copy_general_general_dims<SrcT, DstT, stride_t, D - 1>(
-          src, dst, data_shape, i_strides, o_strides, i_offset, o_offset);
-      i_offset += stride_src;
-      o_offset += stride_dst;
-    }
-  } else {
-    int axis = data_shape.size() - 1;
-    auto stride_src = i_strides[axis];
-    auto stride_dst = o_strides[axis];
-    auto N = data_shape[axis];
-    const SrcT* src_ptr = src.data<SrcT>() + i_offset;
-    DstT* dst_ptr = dst.data<DstT>() + o_offset;
-    for (int i = 0; i < N; i++) {
-      *dst_ptr = static_cast<DstT>(*src_ptr);
-      src_ptr += stride_src;
-      dst_ptr += stride_dst;
-    }
-  }
-}
-
-template <typename SrcT, typename DstT, typename stride_t>
 void copy_general_general(
    const array& src,
    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    const std::vector<stride_t>& o_strides,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset) {
-  auto [new_shape, new_strides] = collapse_contiguous_dims(
-      data_shape, std::vector<std::vector<stride_t>>{i_strides, o_strides});
-  switch (new_shape.size()) {
-    case 1:
-      copy_general_general_dims<SrcT, DstT, stride_t, 1>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
-      return;
-    case 2:
-      copy_general_general_dims<SrcT, DstT, stride_t, 2>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
-      return;
-    case 3:
-      copy_general_general_dims<SrcT, DstT, stride_t, 3>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
-      return;
-    case 4:
-      copy_general_general_dims<SrcT, DstT, stride_t, 4>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
-      return;
-    case 5:
-      copy_general_general_dims<SrcT, DstT, stride_t, 5>(
-          src,
-          dst,
-          new_shape,
-          new_strides[0],
-          new_strides[1],
-          i_offset,
-          o_offset);
-      return;
+  // Strided copy with element conversion from `src` to `dst`. Both sides are
+  // described by `data_shape` plus their own strides and starting offsets
+  // (`i_strides` / `i_offset` for the source, `o_strides` / `o_offset` for
+  // the destination), all expressed in elements.
+  //
+  // An empty shape is a scalar copy: convert and store exactly one element.
+  if (data_shape.empty()) {
+    auto val = static_cast<DstT>(*(src.data<SrcT>() + i_offset));
+    auto dst_ptr = dst.data<DstT>() + o_offset;
+    *dst_ptr = val;
+    return;
  }
-
-  int size = std::accumulate(
-      new_shape.end() - 5, new_shape.end(), 1, std::multiplies<int>());
-  for (int i = 0; i < src.size(); i += size) {
-    stride_t src_offset = i_offset + elem_to_loc(i, new_shape, new_strides[0]);
-    stride_t dst_offset = o_offset + elem_to_loc(i, new_shape, new_strides[1]);
-    copy_general_general_dims<SrcT, DstT, stride_t, 5>(
-        src,
-        dst,
-        new_shape,
-        new_strides[0],
-        new_strides[1],
-        src_offset,
-        dst_offset);
+  // Collapse the shape together with both stride sets so fewer nested loops
+  // are needed; strides[0] belongs to the input, strides[1] to the output.
+  auto [shape, strides] =
+      collapse_contiguous_dims(data_shape, {i_strides, o_strides});
+  auto src_ptr = src.data<SrcT>() + i_offset;
+  auto dst_ptr = dst.data<DstT>() + o_offset;
+  int ndim = shape.size();
+  // Up to three dimensions are handled directly by the unrolled recursion.
+  if (ndim == 1) {
+    copy_dims<SrcT, DstT, 1>(
+        src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
+    return;
+  } else if (ndim == 2) {
+    copy_dims<SrcT, DstT, 2>(
+        src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
+    return;
+  } else if (ndim == 3) {
+    copy_dims<SrcT, DstT, 3>(
+        src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
+    return;
+  }
+  // More than three dimensions: the iterators step over the leading
+  // ndim - 3 dimensions while copy_dims<..., 3> copies each innermost
+  // three-dimensional block.
+  ContiguousIterator in(shape, strides[0], ndim - 3);
+  ContiguousIterator out(shape, strides[1], ndim - 3);
+  // Seed std::accumulate with an int64_t: the accumulator takes the type of
+  // the initial value, so a plain `1` would truncate every partial product to
+  // int despite the int64_t multiplication functor.
+  auto stride = std::accumulate(
+      shape.end() - 3, shape.end(), int64_t{1}, std::multiplies<int64_t>());
+  // NOTE(review): the trip count is derived from src.size(), not from
+  // data_shape -- confirm callers always pass a data_shape whose element
+  // count equals src.size().
+  for (int64_t elem = 0; elem < src.size(); elem += stride) {
+    copy_dims<SrcT, DstT, 3>(
+        src_ptr + in.loc,
+        dst_ptr + out.loc,
+        shape,
+        strides[0],
+        strides[1],
+        ndim - 3);
+    in.step();
+    out.step();
  }
 }

+// Convenience overload: copies all of `src` into `dst` using src.shape() as
+// the data shape, each array's own strides, and zero offsets on both sides.
 template <typename SrcT, typename DstT>
 inline void copy_general_general(const array& src, array& dst) {
-  return copy_general_general<SrcT, DstT, size_t>(
+  copy_general_general<SrcT, DstT>(
      src, dst, src.shape(), src.strides(), dst.strides(), 0, 0);
 }

+// Copy from a strided source described by `data_shape`, `i_strides` and
+// `i_offset` into `dst` starting at `o_offset`.
+//
+// The unnamed Strides parameter (the caller's output strides) is accepted
+// only so the signature lines up with copy_general_general; it is ignored.
+// The output strides actually used are make_contiguous_strides(data_shape).
+// NOTE(review): presumably those are dense row-major strides -- confirm
+// against the helper's definition.
+template <typename SrcT, typename DstT>
+void copy_general(
+    const array& src,
+    array& dst,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides&,
+    int64_t i_offset,
+    int64_t o_offset) {
+  copy_general_general<SrcT, DstT>(
+      src,
+      dst,
+      data_shape,
+      i_strides,
+      make_contiguous_strides(data_shape),
+      i_offset,
+      o_offset);
+}
+
+// Convenience overload: copies all of `src` (its own shape and strides, zero
+// offsets) into `dst`. dst.strides() is not consulted; the output side uses
+// make_contiguous_strides(src.shape()) instead.
+template <typename SrcT, typename DstT>
+inline void copy_general(const array& src, array& dst) {
+  copy_general_general<SrcT, DstT>(
+      src,
+      dst,
+      src.shape(),
+      src.strides(),
+      make_contiguous_strides(src.shape()),
+      0,
+      0);
+}
+
 template <typename SrcT, typename DstT, typename... Args>
 void copy(const array& src, array& dst, CopyType ctype, Args&&... args) {
  switch (ctype) {
@@ -326,6 +151,7 @@ void copy(const array& src, array& dst, CopyType ctype, Args&&... args) {
      return;
    case CopyType::GeneralGeneral:
      copy_general_general<SrcT, DstT>(src, dst, std::forward<Args>(args)...);
+      return;
  }
 }

@@ -426,7 +252,7 @@ inline void copy_inplace_dispatch(
 } // namespace

+// Public entry point that forwards `src`, `dst` and the copy type unchanged
+// to copy_inplace_dispatch; nothing is returned.
 void copy_inplace(const array& src, array& dst, CopyType ctype) {
-  return copy_inplace_dispatch(src, dst, ctype);
+  copy_inplace_dispatch(src, dst, ctype);
 }

 void copy(const array& src, array& dst, CopyType ctype) {
@@ -456,20 +282,19 @@ void copy(const array& src, array& dst, CopyType ctype) {
  copy_inplace(src, dst, ctype);
 }

-template <typename stride_t>
 void copy_inplace(
    const array& src,
    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    const std::vector<stride_t>& o_strides,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    CopyType ctype) {
  switch (ctype) {
    case CopyType::General:
    case CopyType::GeneralGeneral:
-      return copy_inplace_dispatch(
+      copy_inplace_dispatch(
          src,
          dst,
          ctype,
@@ -478,31 +303,11 @@ void copy_inplace(
          o_strides,
          i_offset,
          o_offset);
-
+      break;
    case CopyType::Scalar:
    case CopyType::Vector:
-      return copy_inplace_dispatch(src, dst, ctype);
+      copy_inplace_dispatch(src, dst, ctype);
  }
 }

-template void copy_inplace<size_t>(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<size_t>& i_strides,
-    const std::vector<size_t>& o_strides,
-    int64_t i_offset,
-    int64_t o_offset,
-    CopyType ctype);
-
-template void copy_inplace<int64_t>(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<int64_t>& i_strides,
-    const std::vector<int64_t>& o_strides,
-    int64_t i_offset,
-    int64_t o_offset,
-    CopyType ctype);
-
 } // namespace mlx::core
--- a/mlx/backend/common/copy.h
+++ b/mlx/backend/common/copy.h
@@ -26,13 +26,12 @@ enum class CopyType {
 void copy(const array& src, array& dst, CopyType ctype);
 void copy_inplace(const array& src, array& dst, CopyType ctype);

-template <typename stride_t>
 void copy_inplace(
    const array& src,
    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    const std::vector<stride_t>& o_strides,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    CopyType ctype);
--- a/mlx/backend/common/default_primitives.cpp
+++ b/mlx/backend/common/default_primitives.cpp
@@ -1,14 +1,10 @@
 // Copyright © 2023-2024 Apple Inc.

-#ifdef ACCELERATE_NEW_LAPACK
-#include <Accelerate/Accelerate.h>
-#else
-#include <cblas.h>
-#endif
 #include <cstring>

 #include "mlx/array.h"
 #include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/backend/common/utils.h"
 #include "mlx/primitives.h"

@@ -114,6 +110,7 @@ DEFAULT(Tanh)
 DEFAULT(Transpose)
 DEFAULT(Inverse)
 DEFAULT(Cholesky)
+DEFAULT_MULTI(Eigh)

 namespace {

@@ -133,7 +130,7 @@ inline void matmul_common_general(
    } else {
      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
      copy(arr, arr_copy, CopyType::General);
-      size_t stx = arr.shape(-1);
+      stx = arr.shape(-1);
      return std::make_tuple(false, stx, arr_copy);
    }
  };
--- a/mlx/backend/common/eigh.cpp
+++ b/mlx/backend/common/eigh.cpp
@@ -0,0 +1,117 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include "mlx/allocator.h"
+#include "mlx/array.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
+#include "mlx/linalg.h"
+#include "mlx/primitives.h"
+
+namespace mlx::core {
+
+namespace {
+
+// Thin wrapper around the LAPACK routine ssyevd (single-precision symmetric
+// eigenvalue decomposition), reached through MLX_LAPACK_FUNC.
+//
+// Scalar arguments are taken by value and passed to LAPACK by address. `N` is
+// used both as the matrix order and as the leading dimension of `a`. `jobz`
+// and `uplo` are forwarded untouched, as are the caller-provided workspaces
+// `work` (length `lwork`) and `iwork` (length `liwork`); passing -1 for the
+// lengths performs a workspace-size query (see the LAPACK documentation).
+//
+// Throws std::runtime_error carrying the LAPACK `info` code when it is
+// non-zero.
+void ssyevd(
+    char jobz,
+    char uplo,
+    float* a,
+    int N,
+    float* w,
+    float* work,
+    int lwork,
+    int* iwork,
+    int liwork) {
+  int info;
+  MLX_LAPACK_FUNC(ssyevd)
+  (
+      /* jobz = */ &jobz,
+      /* uplo = */ &uplo,
+      /* n = */ &N,
+      /* a = */ a,
+      /* lda = */ &N,
+      /* w = */ w,
+      /* work = */ work,
+      /* lwork = */ &lwork,
+      /* iwork = */ iwork,
+      /* liwork = */ &liwork,
+      /* info = */ &info);
+  // Any non-zero status is surfaced to the caller as an exception.
+  if (info != 0) {
+    std::stringstream msg;
+    msg << "[Eigh::eval_cpu] Eigenvalue decomposition failed with error code "
+        << info;
+    throw std::runtime_error(msg.str());
+  }
+}
+
+} // namespace
+
+// CPU evaluation of the Eigh primitive.
+//
+// inputs[0] is the matrix (or batch of matrices, N = a.shape(-1)) to
+// decompose. outputs[0] receives the eigenvalues; outputs[1] receives the
+// eigenvectors and is only touched when compute_eigenvectors_ is set. The
+// input is first copied into `vectors`, and ssyevd then works on that copy in
+// place, one N x N matrix at a time. Data is accessed as float throughout.
+//
+// Throws std::runtime_error (from the ssyevd wrapper) if LAPACK reports a
+// failure.
+void Eigh::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
+  const auto& a = inputs[0];
+  auto& values = outputs[0];
+
+  // Without eigenvectors, a temporary array serves as the working copy that
+  // ssyevd operates on.
+  auto vectors = compute_eigenvectors_
+      ? outputs[1]
+      : array(a.shape(), a.dtype(), nullptr, {});
+
+  values.set_data(allocator::malloc_or_wait(values.nbytes()));
+
+  copy(
+      a,
+      vectors,
+      a.flags().row_contiguous ? CopyType::Vector : CopyType::General);
+
+  if (compute_eigenvectors_) {
+    // Set the strides and flags so the eigenvectors
+    // are in the columns of the output
+    auto flags = vectors.flags();
+    auto strides = vectors.strides();
+    auto ndim = a.ndim();
+    std::swap(strides[ndim - 1], strides[ndim - 2]);
+
+    if (a.size() > 1) {
+      flags.row_contiguous = false;
+      if (ndim > 2) {
+        flags.col_contiguous = false;
+      } else {
+        flags.col_contiguous = true;
+      }
+    }
+    vectors.move_shared_buffer(vectors, strides, flags, vectors.data_size());
+  }
+
+  auto vec_ptr = vectors.data<float>();
+  auto eig_ptr = values.data<float>();
+
+  char jobz = compute_eigenvectors_ ? 'V' : 'N';
+  auto N = a.shape(-1);
+
+  // Number of N x N matrices in the batch. The product is formed in size_t so
+  // it cannot overflow int, and the division is guarded so an empty matrix
+  // (N == 0) does not divide by zero.
+  const size_t matrix_size = static_cast<size_t>(N) * static_cast<size_t>(N);
+  const size_t num_matrices = matrix_size > 0 ? a.size() / matrix_size : 0;
+  if (num_matrices == 0) {
+    // Nothing to decompose; the outputs prepared above are already empty.
+    return;
+  }
+
+  // Work query
+  int lwork;
+  int liwork;
+  {
+    float work;
+    int iwork;
+    ssyevd(jobz, uplo_[0], nullptr, N, nullptr, &work, -1, &iwork, -1);
+    lwork = static_cast<int>(work);
+    liwork = iwork;
+  }
+
+  // The same workspaces are reused for every matrix in the batch.
+  auto work_buf = array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
+  auto iwork_buf = array::Data{allocator::malloc_or_wait(sizeof(int) * liwork)};
+  for (size_t i = 0; i < num_matrices; ++i) {
+    ssyevd(
+        jobz,
+        uplo_[0],
+        vec_ptr,
+        N,
+        eig_ptr,
+        static_cast<float*>(work_buf.buffer.raw_ptr()),
+        lwork,
+        static_cast<int*>(iwork_buf.buffer.raw_ptr()),
+        liwork);
+    // Advance to the next matrix and its N eigenvalues.
+    vec_ptr += matrix_size;
+    eig_ptr += N;
+  }
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/indexing.cpp
+++ b/mlx/backend/common/indexing.cpp
@@ -1,5 +1,4 @@
 // Copyright © 2023 Apple Inc.
-
 #include <algorithm>
 #include <cassert>
 #include <cmath>
@@ -33,7 +32,7 @@ void gather(
    const std::vector<array>& inds,
    array& out,
    const std::vector<int>& axes,
-    const std::vector<int>& slice_sizes) {
+    const Shape& slice_sizes) {
  // If the array is row contiguous then we can do a contiguous copy given
  // two conditions on the slice size:
  // - Any number of leading ones in the slice sizes are allowed
@@ -81,11 +80,17 @@ void gather(
  T* dst_ptr = out.data<T>();
  size_t out_idx = 0;

+  std::vector<ContiguousIterator> its(inds.begin(), inds.end());
+  ContiguousIterator src_it;
+  if (!can_copy && src.ndim() > 0) {
+    src_it = ContiguousIterator(slice_sizes, src.strides(), src.ndim());
+  }
  for (int idx = 0; idx < ind_size; idx++) {
    size_t src_idx = 0;
    for (int ii = 0; ii < inds.size(); ++ii) {
      auto ax = axes[ii];
-      auto idx_loc = elem_to_loc(idx, inds[ii]);
+      auto idx_loc = its[ii].loc;
+      its[ii].step();
      auto idx_val =
          offset_neg_idx(inds[ii].data<IdxT>()[idx_loc], src.shape(ax));
      src_idx += (idx_val * src.strides()[ax]);
@@ -99,9 +104,10 @@ void gather(
      out_idx += slice_size;
    } else {
      for (int jj = 0; jj < slice_size; jj++) {
-        auto src_offset = elem_to_loc(jj, slice_sizes, src.strides());
-        dst_ptr[out_idx++] = src_ptr[src_idx + src_offset];
+        dst_ptr[out_idx++] = src_ptr[src_idx + src_it.loc];
+        src_it.step();
      }
+      src_it.reset();
    }
  }
 }
@@ -112,7 +118,7 @@ void dispatch_gather(
    const std::vector<array>& inds,
    array& out,
    const std::vector<int>& axes,
-    const std::vector<int>& size) {
+    const Shape& size) {
  switch (out.dtype()) {
    case bool_:
      gather<bool, IdxT>(src, inds, out, axes, size);
@@ -216,28 +222,36 @@ void scatter(
  auto inds_ndim = updates.ndim() - out.ndim();
  size_t n_updates = nind ? inds[0].size() : 1;

-  std::vector<int> update_shape(
+  Shape update_shape(
      updates.shape().begin() + inds_ndim, updates.shape().end());
  size_t update_size = 1;
  for (auto us : update_shape) {
    update_size *= us;
  }

+  std::vector<ContiguousIterator> its(inds.begin(), inds.end());
+  ContiguousIterator update_it(updates);
+  ContiguousIterator out_it(update_shape, out.strides(), out.ndim());
+
  for (int i = 0; i < n_updates; ++i) {
    size_t out_offset = 0;
    for (int j = 0; j < nind; ++j) {
      auto ax = axes[j];
-      auto idx_loc = elem_to_loc(i, inds[j]);
+      auto idx_loc = its[j].loc;
+      its[j].step();
      auto idx_val =
          offset_neg_idx(inds[j].data<IdxT>()[idx_loc], out.shape(ax));
      out_offset += (idx_val * out.strides()[ax]);
    }
+    update_it.seek(i * update_size);
    for (int j = 0; j < update_size; ++j) {
-      auto update_loc = elem_to_loc(i * update_size + j, updates);
-      auto out_loc = elem_to_loc(j, update_shape, out.strides());
-      op(updates.data<InT>()[update_loc],
-         out.data<InT>() + out_offset + out_loc);
+      op(updates.data<InT>()[update_it.loc],
+         out.data<InT>() + out_offset + out_it.loc);
+      update_it.step();
+      out_it.step();
    }
+    out_it.reset();
+    update_it.reset();
  }
 }

--- a/mlx/backend/common/inverse.cpp
+++ b/mlx/backend/common/inverse.cpp
@@ -2,39 +2,19 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/primitives.h"

-#ifdef ACCELERATE_NEW_LAPACK
-#include <Accelerate/Accelerate.h>
-#else
-#include <lapack.h>
-#endif
-
-// Wrapper to account for differences in
-// LAPACK implementations (basically how to pass the 'uplo' string to fortran).
+// Calls the LAPACK routine strtri through MLX_LAPACK_FUNC on the N x N float
+// `matrix` (N is passed as both the order and the leading dimension),
+// forwarding `uplo` and `diag` by address. Returns the LAPACK `info` status
+// code unchanged; interpreting it is left to the caller.
 int strtri_wrapper(char uplo, char diag, float* matrix, int N) {
  int info;
-
-#ifdef LAPACK_FORTRAN_STRLEN_END
-  strtri_(
-      /* uplo = */ &uplo,
-      /* diag = */ &diag,
-      /* N = */ &N,
-      /* a = */ matrix,
-      /* lda = */ &N,
-      /* info = */ &info,
-      /* uplo_len = */ static_cast<size_t>(1),
-      /* diag_len = */ static_cast<size_t>(1));
-#else
-  strtri_(
+  MLX_LAPACK_FUNC(strtri)
+  (
      /* uplo = */ &uplo,
      /* diag = */ &diag,
      /* N = */ &N,
      /* a = */ matrix,
      /* lda = */ &N,
      /* info = */ &info);
-#endif
-
  return info;
 }

--- a/mlx/backend/common/lapack_helper.h
+++ b/mlx/backend/common/lapack_helper.h
@@ -1,10 +1,20 @@
-// Copyright © 2024 Apple Inc.
+// Copyright © 2023-2024 Apple Inc.

 #pragma once

+// Required for Visual Studio.
+// https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/install.md
+#ifdef _MSC_VER
+#include <complex>
+#define LAPACK_COMPLEX_CUSTOM
+#define lapack_complex_float std::complex<float>
+#define lapack_complex_double std::complex<double>
+#endif
+
 #ifdef ACCELERATE_NEW_LAPACK
 #include <Accelerate/Accelerate.h>
 #else
+#include <cblas.h>
 #include <lapack.h>
 #endif

--- a/mlx/backend/common/make_compiled_preamble.sh
+++ b/mlx/backend/common/make_compiled_preamble.sh
@@ -18,10 +18,12 @@ if [ "$CLANG" = "TRUE" ]; then
  #include <cstdint>
  #include <vector>
 EOM
-
+CC_FLAGS=""
+else
+CC_FLAGS="-std=c++17"
 fi

-CONTENT=$($GCC -I "$SRCDIR" -E "$SRCDIR/mlx/backend/common/compiled_preamble.h" 2>/dev/null)
+CONTENT=$($GCC $CC_FLAGS -I "$SRCDIR" -E "$SRCDIR/mlx/backend/common/compiled_preamble.h" 2>/dev/null)

 cat << EOF > "$OUTPUT_FILE"
 const char* get_kernel_preamble() {
--- a/mlx/backend/common/masked_mm.cpp
+++ b/mlx/backend/common/masked_mm.cpp
@@ -1,15 +1,10 @@
 // Copyright © 2024 Apple Inc.

-#ifdef ACCELERATE_NEW_LAPACK
-#include <Accelerate/Accelerate.h>
-#else
-#include <cblas.h>
-#endif
-
 #include <cstring>

 #include "mlx/array.h"
 #include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/backend/common/utils.h"
 #include "mlx/primitives.h"

@@ -24,10 +19,10 @@ inline void mask_matrix(
    int block_size,
    const int X,
    const int Y,
-    const size_t X_data_str,
-    const size_t Y_data_str,
-    const size_t X_mask_str,
-    const size_t Y_mask_str,
+    const int64_t X_data_str,
+    const int64_t Y_data_str,
+    const int64_t X_mask_str,
+    const int64_t Y_mask_str,
    const size_t mask_offset) {
  int tX = (X + block_size - 1) / block_size;
  int tY = (Y + block_size - 1) / block_size;
@@ -89,7 +84,7 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
        } else {
          array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
          copy(arr, arr_copy, CopyType::General);
-          size_t stx = arr.shape(-1);
+          int64_t stx = arr.shape(-1);
          return std::make_tuple(false, stx, arr_copy);
        }
      };
@@ -122,13 +117,13 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
                       int Y,
                       size_t X_data_str,
                       size_t Y_data_str) {
-    size_t mask_offset = elem_to_loc(
+    auto mask_offset = elem_to_loc(
        mask.shape(-1) * mask.shape(-2) * batch_idx,
        mask.shape(),
        mask.strides());

-    size_t X_mask_str = mask.strides()[mask.ndim() - 2];
-    size_t Y_mask_str = mask.strides()[mask.ndim() - 1];
+    auto X_mask_str = mask.strides()[mask.ndim() - 2];
+    auto Y_mask_str = mask.strides()[mask.ndim() - 1];

    if (mask.dtype() == bool_) {
      return mask_matrix(
@@ -235,7 +230,7 @@ void GatherMM::eval(const std::vector<array>& inputs, array& out) {
    } else {
      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
      copy(arr, arr_copy, CopyType::General);
-      size_t stx = arr.shape(-1);
+      int64_t stx = arr.shape(-1);
      return std::make_tuple(false, stx, arr_copy);
    }
  };
@@ -267,13 +262,13 @@ void GatherMM::eval(const std::vector<array>& inputs, array& out) {
  auto& lhs_indices = inputs[2];
  auto& rhs_indices = inputs[3];

-  std::vector<int> batch_shape = get_batch_dims(out.shape());
+  auto batch_shape = get_batch_dims(out.shape());
  int batch_ndim = batch_shape.size();

-  std::vector<int> batch_shape_A = get_batch_dims(a.shape());
-  std::vector<size_t> batch_strides_A = get_batch_dims(a.strides());
-  std::vector<int> batch_shape_B = get_batch_dims(b.shape());
-  std::vector<size_t> batch_strides_B = get_batch_dims(b.strides());
+  auto batch_shape_A = get_batch_dims(a.shape());
+  auto batch_strides_A = get_batch_dims(a.strides());
+  auto batch_shape_B = get_batch_dims(b.shape());
+  auto batch_strides_B = get_batch_dims(b.strides());

  const uint32_t* lhs_indices_ptr = lhs_indices.data<uint32_t>();
  const uint32_t* rhs_indices_ptr = rhs_indices.data<uint32_t>();
--- a/mlx/backend/common/ops.h
+++ b/mlx/backend/common/ops.h
@@ -295,6 +295,13 @@ struct Floor {
  }
 };

+struct Imag {
+  template <typename T>
+  T operator()(T x) {
+    return std::imag(x);
+  }
+};
+
 struct Log {
  template <typename T>
  T operator()(T x) {
@@ -337,6 +344,13 @@ struct Negative {
  }
 };

+struct Real {
+  template <typename T>
+  T operator()(T x) {
+    return std::real(x);
+  }
+};
+
 struct Round {
  template <typename T>
  T operator()(T x) {
@@ -486,7 +500,12 @@ struct Equal {
 struct NaNEqual {
  template <typename T>
  bool operator()(T x, T y) {
-    return x == y || (std::isnan(x) && std::isnan(y));
+    if constexpr (std::is_integral_v<T>) {
+      // isnan always returns false for integers, and MSVC refuses to compile.
+      return x == y;
+    } else {
+      return x == y || (std::isnan(x) && std::isnan(y));
+    }
  }
 };

--- a/mlx/backend/common/primitives.cpp
+++ b/mlx/backend/common/primitives.cpp
@@ -159,6 +159,17 @@ void Conjugate::eval(const std::vector<array>& inputs, array& out) {
  }
 }

+void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  auto& in = inputs[0];
+  if (in.flags().row_contiguous ||
+      (allow_col_major_ && in.flags().col_contiguous)) {
+    out.copy_shared_buffer(in);
+  } else {
+    copy(in, out, CopyType::General);
+  }
+}
+
 void Cos::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
@@ -273,6 +284,10 @@ void Full::eval(const std::vector<array>& inputs, array& out) {
  copy(in, out, ctype);
 }

+void Imag::eval_cpu(const std::vector<array>& inputs, array& out) {
+  unary_op<complex64_t, float>(inputs[0], out, detail::Imag());
+}
+
 void Log::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
@@ -398,6 +413,10 @@ void RandomBits::eval(const std::vector<array>& inputs, array& out) {
  }
 }

+void Real::eval_cpu(const std::vector<array>& inputs, array& out) {
+  unary_op<complex64_t, float>(inputs[0], out, detail::Real());
+}
+
 void Reshape::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
@@ -406,16 +425,7 @@ void Reshape::eval(const std::vector<array>& inputs, array& out) {

  if (copy_necessary) {
    out.set_data(allocator::malloc_or_wait(out.nbytes()));
-    auto out_strides = make_contiguous_strides<size_t>(in.shape());
-    copy_inplace<size_t>(
-        in,
-        out,
-        in.shape(),
-        in.strides(),
-        out_strides,
-        0,
-        0,
-        CopyType::General);
+    copy_inplace(in, out, CopyType::General);
  } else {
    shared_buffer_reshape(in, out_strides, out);
  }
@@ -488,14 +498,15 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];

  // Calculate out strides, initial offset and if copy needs to be made
-  auto [copy_needed, data_offset, inp_strides] =
-      prepare_slice(in, start_indices_, strides_);
+  auto [data_offset, inp_strides] = prepare_slice(in, start_indices_, strides_);
+  auto copy_needed = std::any_of(
+      strides_.begin(), strides_.end(), [](auto i) { return i < 0; });

  // Do copy if needed
  if (copy_needed) {
    out.set_data(allocator::malloc_or_wait(out.nbytes()));
-    std::vector<int64_t> ostrides{out.strides().begin(), out.strides().end()};
-    copy_inplace<int64_t>(
+    Strides ostrides{out.strides().begin(), out.strides().end()};
+    copy_inplace(
        /* const array& src = */ in,
        /* array& dst = */ out,
        /* const std::vector<int>& data_shape = */ out.shape(),
@@ -513,7 +524,7 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
      }
    }
    size_t data_size = data_end - data_offset;
-    std::vector<size_t> ostrides{inp_strides.begin(), inp_strides.end()};
+    Strides ostrides{inp_strides.begin(), inp_strides.end()};
    shared_buffer_slice(in, ostrides, data_offset, data_size, out);
  }
 }
@@ -540,11 +551,14 @@ void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype);

  // Calculate out strides, initial offset and if copy needs to be made
-  auto [data_offset, out_strides] = prepare_slice(out);
+  // The update is written into `out`, so the slice offset and strides are
+  // derived from `out` rather than from the source array `in`.
+  auto [data_offset, out_strides] =
+      prepare_slice(out, start_indices_, strides_);

  // Do copy
-  std::vector<int64_t> upd_strides{upd.strides().begin(), upd.strides().end()};
-  copy_inplace<int64_t>(
+  Strides upd_strides{upd.strides().begin(), upd.strides().end()};
+  copy_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const std::vector<int>& data_shape = */ upd.shape(),
@@ -607,7 +618,7 @@ void View::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (ibytes == obytes || obytes < ibytes && in.strides().back() == 1 ||
      in.flags().row_contiguous) {
    auto strides = in.strides();
-    for (int i = 0; i < strides.size() - 1; ++i) {
+    for (int i = 0; i < static_cast<int>(strides.size()) - 1; ++i) {
      strides[i] *= ibytes;
      strides[i] /= obytes;
    }
--- a/mlx/backend/common/qrf.cpp
+++ b/mlx/backend/common/qrf.cpp
@@ -2,14 +2,9 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/primitives.h"

-#ifdef ACCELERATE_NEW_LAPACK
-#include <Accelerate/Accelerate.h>
-#else
-#include <lapack.h>
-#endif
-
 namespace mlx::core {

 template <typename T>
@@ -59,7 +54,7 @@ void qrf_impl(const array& a, array& q, array& r) {
  // Copy the input to be column contiguous
  flags.col_contiguous = num_matrices == 1;
  flags.row_contiguous = false;
-  std::vector<size_t> strides = in.strides();
+  auto strides = in.strides();
  strides[in.ndim() - 2] = 1;
  strides[in.ndim() - 1] = M;
  in.set_data(
--- a/mlx/backend/common/quantized.cpp
+++ b/mlx/backend/common/quantized.cpp
@@ -2,13 +2,38 @@

 #include <cassert>

-#include "mlx/backend/metal/copy.h"
+#include "mlx/backend/common/copy.h"
+#include "mlx/backend/common/ops.h"
+#include "mlx/fast_primitives.h"
 #include "mlx/primitives.h"
+#include "mlx/utils.h"

 namespace mlx::core {

 namespace {

+template <typename T, int bits>
+void extract_bits(const uint8_t* w_in, T* w_out) {
+  assert(bits == 3 || bits == 6);
+  if (bits == 3) {
+    w_out[0] = static_cast<T>(w_in[0] & 0x7);
+    w_out[1] = static_cast<T>((w_in[0] & 0x38) >> 3);
+    w_out[2] = static_cast<T>(((w_in[0] & 0xc0) >> 6) + ((w_in[1] & 0x1) << 2));
+    w_out[3] = static_cast<T>((w_in[1] & 0xe) >> 1);
+    w_out[4] = static_cast<T>((w_in[1] & 0x70) >> 4);
+    w_out[5] = static_cast<T>(((w_in[1] & 0x80) >> 7) + ((w_in[2] & 0x3) << 1));
+    w_out[6] = static_cast<T>((w_in[2] & 0x1c) >> 2);
+    w_out[7] = static_cast<T>((w_in[2] & 0xe0) >> 5);
+  } else if (bits == 6) {
+    w_out[0] = static_cast<T>(w_in[0] & 0x3f);
+    w_out[1] =
+        static_cast<T>(((w_in[0] >> 6) & 0x03) + ((w_in[1] & 0x0f) << 2));
+    w_out[2] =
+        static_cast<T>(((w_in[1] >> 4) & 0x0f) + ((w_in[2] & 0x03) << 4));
+    w_out[3] = static_cast<T>((w_in[2] >> 2) & 0x3f);
+  }
+}
+
 template <typename T, int bits, int group_size>
 void _qmm(
    T* result,
@@ -20,13 +45,12 @@ void _qmm(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-  constexpr int pack_factor = 32 / bits;
+  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
+  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
  constexpr int packs_in_group = group_size / pack_factor;
-  const int Ng = N / group_size;
-  const int Nw = N / pack_factor;

  for (int m = 0; m < M; m++) {
-    const uint32_t* w_local = w;
+    const uint8_t* w_local = (const uint8_t*)w;
    const T* scales_local = scales;
    const T* biases_local = biases;

@@ -40,13 +64,25 @@ void _qmm(
        T scale = *scales_local++;
        T bias = *biases_local++;
        for (int ng = 0; ng < packs_in_group; ng++) {
-          uint32_t wi = *w_local++;
-
+          if (bits == 3 || bits == 6) {
+            T wl[pack_factor];
+            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
-          for (int p = 0; p < pack_factor; p++) {
-            (*result_local++) +=
-                xi * (scale * static_cast<T>(wi & bitmask) + bias);
-            wi >>= bits;
+            for (int p = 0; p < pack_factor; p++) {
+              (*result_local++) += xi * (scale * wl[p] + bias);
+            }
+            w_local += bytes_per_pack;
+
+          } else {
+            uint8_t wi = *w_local++;
+#pragma clang loop unroll(full)
+            for (int p = 0; p < pack_factor; p++) {
+              (*result_local++) +=
+                  xi * (scale * static_cast<T>(wi & bitmask) + bias);
+              if (bits != 8) {
+                wi >>= bits;
+              }
+            }
          }
        }
      }
@@ -67,13 +103,12 @@ void _qmm_t(
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
-  constexpr int pack_factor = 32 / bits;
+  constexpr int pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
+  constexpr int bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;
  constexpr int packs_in_group = group_size / pack_factor;
-  const int Kg = K / group_size;
-  const int Kw = K / pack_factor;

  for (int m = 0; m < M; m++) {
-    const uint32_t* w_local = w;
+    const uint8_t* w_local = (const uint8_t*)w;
    const T* scales_local = scales;
    const T* biases_local = biases;

@@ -85,12 +120,26 @@ void _qmm_t(
        T bias = *biases_local++;

        for (int kw = 0; kw < packs_in_group; kw++) {
-          uint32_t wi = *w_local++;
-
+          if (bits == 3 || bits == 6) {
+            T wl[pack_factor];
+            extract_bits<T, bits>(w_local, wl);
 #pragma clang loop unroll(full)
-          for (int p = 0; p < pack_factor; p++) {
-            sum += (*x_local++) * (scale * static_cast<T>(wi & bitmask) + bias);
-            wi >>= bits;
+            for (int p = 0; p < pack_factor; p++) {
+              sum += x_local[p] * (scale * wl[p] + bias);
+            }
+            w_local += bytes_per_pack;
+            x_local += pack_factor;
+
+          } else {
+            uint8_t wi = *w_local++;
+#pragma clang loop unroll(full)
+            for (int p = 0; p < pack_factor; p++) {
+              sum +=
+                  (*x_local++) * (scale * static_cast<T>(wi & bitmask) + bias);
+              if (bits != 8) {
+                wi >>= bits;
+              }
+            }
          }
        }
      }
@@ -102,6 +151,55 @@ void _qmm_t(
  }
 }

+template <typename T, int bits, int group_size>
+void _qmm_dispatch_transpose(
+    T* result,
+    const T* x,
+    const uint32_t* w,
+    const T* scales,
+    const T* biases,
+    int M,
+    int N,
+    int K,
+    bool transposed_w) {
+  if (transposed_w) {
+    return _qmm_t<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
+  } else {
+    return _qmm<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
+  }
+}
+
+template <typename T, int bits>
+void _qmm_dispatch_group(
+    T* result,
+    const T* x,
+    const uint32_t* w,
+    const T* scales,
+    const T* biases,
+    int M,
+    int N,
+    int K,
+    int group_size,
+    bool transposed_w) {
+  switch (group_size) {
+    case 32:
+      _qmm_dispatch_transpose<T, bits, 32>(
+          result, x, w, scales, biases, M, N, K, transposed_w);
+      break;
+    case 64:
+      _qmm_dispatch_transpose<T, bits, 64>(
+          result, x, w, scales, biases, M, N, K, transposed_w);
+      break;
+    case 128:
+      _qmm_dispatch_transpose<T, bits, 128>(
+          result, x, w, scales, biases, M, N, K, transposed_w);
+      break;
+    default:
+      throw std::invalid_argument(
+          "Quantization group size must be 32, 64 or 128.");
+  }
+}
+
 template <typename T>
 void _qmm_dispatch_typed(
    T* result,
@@ -116,79 +214,29 @@ void _qmm_dispatch_typed(
    int bits,
    bool transposed_w) {
  switch (bits) {
-    case 2: {
-      switch (group_size) {
-        case 32:
-          if (transposed_w) {
-            return _qmm_t<T, 2, 32>(result, x, w, scales, biases, M, N, K);
-          } else {
-            return _qmm<T, 2, 32>(result, x, w, scales, biases, M, N, K);
-          }
-        case 64:
-          if (transposed_w) {
-            return _qmm_t<T, 2, 64>(result, x, w, scales, biases, M, N, K);
-          } else {
-            return _qmm<T, 2, 64>(result, x, w, scales, biases, M, N, K);
-          }
-        case 128:
-          if (transposed_w) {
-            return _qmm_t<T, 2, 128>(result, x, w, scales, biases, M, N, K);
-          } else {
-            return _qmm<T, 2, 128>(result, x, w, scales, biases, M, N, K);
-          }
-      }
-    }
-    case 4: {
-      switch (group_size) {
-        case 32:
-          if (transposed_w) {
-            return _qmm_t<T, 4, 32>(result, x, w, scales, biases, M, N, K);
-          } else {
-            return _qmm<T, 4, 32>(result, x, w, scales, biases, M, N, K);
-          }
-        case 64:
-          if (transposed_w) {
-            return _qmm_t<T, 4, 64>(result, x, w, scales, biases, M, N, K);
-          } else {
-            return _qmm<T, 4, 64>(result, x, w, scales, biases, M, N, K);
-          }
-        case 128:
-          if (transposed_w) {
-            return _qmm_t<T, 4, 128>(result, x, w, scales, biases, M, N, K);
-          } else {
-            return _qmm<T, 4, 128>(result, x, w, scales, biases, M, N, K);
-          }
-      }
-    }
-    case 8: {
-      switch (group_size) {
-        case 32:
-          if (transposed_w) {
-            return _qmm_t<T, 8, 32>(result, x, w, scales, biases, M, N, K);
-          } else {
-            return _qmm<T, 8, 32>(result, x, w, scales, biases, M, N, K);
-          }
-        case 64:
-          if (transposed_w) {
-            return _qmm_t<T, 8, 64>(result, x, w, scales, biases, M, N, K);
-          } else {
-            return _qmm<T, 8, 64>(result, x, w, scales, biases, M, N, K);
-          }
-        case 128:
-          if (transposed_w) {
-            return _qmm_t<T, 8, 128>(result, x, w, scales, biases, M, N, K);
-          } else {
-            return _qmm<T, 8, 128>(result, x, w, scales, biases, M, N, K);
-          }
-      }
-    }
+    case 2:
+      _qmm_dispatch_group<T, 2>(
+          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
+      break;
+    case 3:
+      _qmm_dispatch_group<T, 3>(
+          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
+      break;
+    case 4:
+      _qmm_dispatch_group<T, 4>(
+          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
+      break;
+    case 6:
+      _qmm_dispatch_group<T, 6>(
+          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
+      break;
+    case 8:
+      _qmm_dispatch_group<T, 8>(
+          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
+      break;
+    default:
+      throw std::invalid_argument("Quantization bits must be 2, 3, 4, 6 or 8.");
  }
-  std::ostringstream msg;
-  msg << "Quantization type not supported. Provided bits=" << bits
-      << " and group_size=" << group_size
-      << ". The supported options are bits in "
-      << "{2, 4, 8} and group_size in {64, 128}.";
-  throw std::invalid_argument(msg.str());
 }

 void _qmm_dispatch(
@@ -201,55 +249,61 @@ void _qmm_dispatch(
    int group_size,
    bool transposed_w) {
  int K = x.shape(-1);
-  int M = x.size() / K;
+  int M = x.shape(-2);
  int N = out.shape(-1);

-  switch (x.dtype()) {
-    case float32:
-      _qmm_dispatch_typed<float>(
-          out.data<float>(),
-          x.data<float>(),
-          w.data<uint32_t>(),
-          scales.data<float>(),
-          biases.data<float>(),
-          M,
-          N,
-          K,
-          bits,
-          group_size,
-          transposed_w);
-      break;
-    case float16:
-      _qmm_dispatch_typed<float16_t>(
-          out.data<float16_t>(),
-          x.data<float16_t>(),
-          w.data<uint32_t>(),
-          scales.data<float16_t>(),
-          biases.data<float16_t>(),
-          M,
-          N,
-          K,
-          bits,
-          group_size,
-          transposed_w);
-      break;
-    case bfloat16:
-      _qmm_dispatch_typed<bfloat16_t>(
-          out.data<bfloat16_t>(),
-          x.data<bfloat16_t>(),
-          w.data<uint32_t>(),
-          scales.data<bfloat16_t>(),
-          biases.data<bfloat16_t>(),
-          M,
-          N,
-          K,
-          bits,
-          group_size,
-          transposed_w);
-      break;
-    default:
-      throw std::invalid_argument(
-          "[quantized_matmul] only floating types are supported");
+  int w_els = w.ndim() > 2 ? w.shape(-1) * w.shape(-2) : 0;
+  int g_els = w.ndim() > 2 ? scales.shape(-1) * scales.shape(-2) : 0;
+
+  int batch_size = x.size() / x.shape(-1) / x.shape(-2);
+  for (int i = 0; i < batch_size; i++) {
+    switch (x.dtype()) {
+      case float32:
+        _qmm_dispatch_typed<float>(
+            out.data<float>() + i * M * N,
+            x.data<float>() + elem_to_loc(i * M * K, x),
+            w.data<uint32_t>() + elem_to_loc(i * w_els, w),
+            scales.data<float>() + elem_to_loc(i * g_els, scales),
+            biases.data<float>() + elem_to_loc(i * g_els, biases),
+            M,
+            N,
+            K,
+            bits,
+            group_size,
+            transposed_w);
+        break;
+      case float16:
+        _qmm_dispatch_typed<float16_t>(
+            out.data<float16_t>() + i * M * N,
+            x.data<float16_t>() + elem_to_loc(i * M * K, x),
+            w.data<uint32_t>() + elem_to_loc(i * w_els, w),
+            scales.data<float16_t>() + elem_to_loc(i * g_els, scales),
+            biases.data<float16_t>() + elem_to_loc(i * g_els, biases),
+            M,
+            N,
+            K,
+            bits,
+            group_size,
+            transposed_w);
+        break;
+      case bfloat16:
+        _qmm_dispatch_typed<bfloat16_t>(
+            out.data<bfloat16_t>() + i * M * N,
+            x.data<bfloat16_t>() + elem_to_loc(i * M * K, x),
+            w.data<uint32_t>() + elem_to_loc(i * w_els, w),
+            scales.data<bfloat16_t>() + elem_to_loc(i * g_els, scales),
+            biases.data<bfloat16_t>() + elem_to_loc(i * g_els, biases),
+            M,
+            N,
+            K,
+            bits,
+            group_size,
+            transposed_w);
+        break;
+      default:
+        throw std::invalid_argument(
+            "[quantized_matmul] only floating types are supported");
+    }
  }
 }

@@ -398,4 +452,114 @@ void GatherQMM::eval(const std::vector<array>& inputs, array& out) {
      transpose_);
 }

+template <typename T, typename U>
+void quantize(
+    const array& w_,
+    array& out_,
+    array& scales_,
+    array& biases_,
+    int bits,
+    int group_size) {
+  const T* w = w_.data<T>();
+
+  auto out = out_.data<U>();
+  T* scales = scales_.data<T>();
+  T* biases = biases_.data<T>();
+
+  T n_bins = (1 << bits) - 1;
+  T eps = 1e-7;
+  bool power_of_2_bits = is_power_of_2(bits);
+  int el_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;
+  // For 3/6 bits we read 3 uint8s at a time instead of 1 uint32
+  int bytes_per_pack = power_of_2_bits ? 1 : 3;
+  int int_per_group = group_size * bytes_per_pack / el_per_int;
+  size_t n_groups = w_.size() / group_size;
+
+  for (size_t i = 0; i < n_groups; ++i) {
+    size_t w_idx = i * group_size;
+    T w_min = std::numeric_limits<float>::infinity();
+    T w_max = -w_min;
+    for (int j = 0; j < group_size; ++j) {
+      w_max = std::max(w_max, w[w_idx + j]);
+      w_min = std::min(w_min, w[w_idx + j]);
+    }
+    bool mask = std::abs(w_min) > std::abs(w_max);
+    T scale = std::max(T((w_max - w_min) / n_bins), eps);
+    scale = mask ? scale : -scale;
+
+    auto edge = mask ? w_min : w_max;
+    auto q0 = std::rint(edge / scale);
+    if (q0 == 0) {
+      scales[i] = scale;
+      biases[i] = 0;
+    } else {
+      scales[i] = edge / q0;
+      biases[i] = edge;
+    }
+    size_t out_idx = i * int_per_group;
+    for (int j = 0; j < int_per_group / bytes_per_pack; ++j) {
+      uint32_t out_el = 0;
+      for (int k = 0; k < el_per_int; ++k) {
+        T w_el = w[w_idx + j * el_per_int + k];
+        w_el = std::rint((w_el - biases[i]) / scales[i]);
+        w_el = std::min(std::max(w_el, T(0)), n_bins);
+        out_el |= static_cast<uint32_t>(w_el) << (k * bits);
+      }
+      if (power_of_2_bits) {
+        out[out_idx + j] = out_el;
+      } else {
+        out[out_idx + bytes_per_pack * j] = out_el & 0xff;
+        out[out_idx + bytes_per_pack * j + 1] = (out_el & 0xff00) >> 8;
+        out[out_idx + bytes_per_pack * j + 2] = (out_el & 0xff0000) >> 16;
+      }
+    }
+  }
+}
+
+void fast::AffineQuantize::eval_cpu(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs) {
+  auto ensure_row_contiguous = [](const array& arr) {
+    if (arr.flags().row_contiguous) {
+      return arr;
+    } else {
+      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
+      copy(arr, arr_copy, CopyType::General);
+      return arr_copy;
+    }
+  };
+  auto w = ensure_row_contiguous(inputs[0]);
+
+  auto& out = outputs[0];
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+
+  auto& scales = outputs[1];
+  auto& biases = outputs[2];
+  scales.set_data(allocator::malloc_or_wait(scales.nbytes()));
+  biases.set_data(allocator::malloc_or_wait(biases.nbytes()));
+  if (w.dtype() == float16) {
+    if (is_power_of_2(bits_)) {
+      quantize<float16_t, uint32_t>(w, out, scales, biases, bits_, group_size_);
+    } else {
+      quantize<float16_t, uint8_t>(w, out, scales, biases, bits_, group_size_);
+    }
+  } else if (w.dtype() == bfloat16) {
+    if (is_power_of_2(bits_)) {
+      quantize<bfloat16_t, uint32_t>(
+          w, out, scales, biases, bits_, group_size_);
+    } else {
+      quantize<bfloat16_t, uint8_t>(w, out, scales, biases, bits_, group_size_);
+    }
+  } else if (w.dtype() == float32) {
+    if (is_power_of_2(bits_)) {
+      quantize<float, uint32_t>(w, out, scales, biases, bits_, group_size_);
+    } else {
+      quantize<float, uint8_t>(w, out, scales, biases, bits_, group_size_);
+    }
+  } else {
+    throw std::runtime_error(
+        "[fast::AffineQuantize::eval_cpu] Only supports floating point inputs");
+  }
+}
+
 } // namespace mlx::core
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -120,65 +120,73 @@ struct MinReduce {
 };

 template <typename InT>
-void reduce_dispatch_out(
+void reduce_dispatch_and_or(
    const array& in,
    array& out,
    Reduce::ReduceType rtype,
    const std::vector<int>& axes) {
-  switch (rtype) {
-    case Reduce::And: {
-      reduction_op<InT, bool>(in, out, axes, true, AndReduce());
-      break;
+  if (rtype == Reduce::And) {
+    reduction_op<InT, bool>(in, out, axes, true, AndReduce());
+  } else {
+    reduction_op<InT, bool>(in, out, axes, false, OrReduce());
+  }
+}
+
+template <typename InT>
+void reduce_dispatch_sum_prod(
+    const array& in,
+    array& out,
+    Reduce::ReduceType rtype,
+    const std::vector<int>& axes) {
+  if (rtype == Reduce::Sum) {
+    auto op = [](auto y, auto x) { (*y) = (*y) + x; };
+    if constexpr (std::is_integral_v<InT> && sizeof(InT) <= 4) {
+      reduction_op<InT, int32_t>(in, out, axes, 0, op);
+    } else {
+      reduction_op<InT, InT>(in, out, axes, 0, op);
    }
-    case Reduce::Or: {
-      reduction_op<InT, bool>(in, out, axes, false, OrReduce());
-      break;
-    }
-    case Reduce::Sum: {
-      auto op = [](auto y, auto x) { (*y) = (*y) + x; };
-      if (out.dtype() == int32) {
-        // special case since the input type can be bool
-        reduction_op<InT, int32_t>(in, out, axes, 0, op);
-      } else {
-        reduction_op<InT, InT>(in, out, axes, 0, op);
-      }
-      break;
-    }
-    case Reduce::Prod: {
-      auto op = [](auto y, auto x) { (*y) *= x; };
+  } else {
+    auto op = [](auto y, auto x) { (*y) *= x; };
+    if constexpr (std::is_integral_v<InT> && sizeof(InT) <= 4) {
+      reduction_op<InT, int32_t>(in, out, axes, 1, op);
+    } else {
      reduction_op<InT, InT>(in, out, axes, 1, op);
-      break;
-    }
-    case Reduce::Max: {
-      auto init = Limits<InT>::min;
-      reduction_op<InT, InT>(in, out, axes, init, MaxReduce());
-      break;
-    }
-    case Reduce::Min: {
-      auto init = Limits<InT>::max;
-      reduction_op<InT, InT>(in, out, axes, init, MinReduce());
-      break;
    }
  }
 }

+template <typename InT>
+void reduce_dispatch_min_max(
+    const array& in,
+    array& out,
+    Reduce::ReduceType rtype,
+    const std::vector<int>& axes) {
+  if (rtype == Reduce::Max) {
+    auto init = Limits<InT>::min;
+    reduction_op<InT, InT>(in, out, axes, init, MaxReduce());
+  } else {
+    auto init = Limits<InT>::max;
+    reduction_op<InT, InT>(in, out, axes, init, MinReduce());
+  }
+}
+
 } // namespace

 void nd_loop(
    std::function<void(int)> callback,
-    const std::vector<int>& shape,
-    const std::vector<size_t>& strides) {
+    const Shape& shape,
+    const Strides& strides) {
  std::function<void(int, int)> loop_inner;
  loop_inner = [&](int dim, int offset) {
    if (dim < shape.size() - 1) {
-      int size = shape[dim];
-      size_t stride = strides[dim];
+      auto size = shape[dim];
+      auto stride = strides[dim];
      for (int i = 0; i < size; i++) {
        loop_inner(dim + 1, offset + i * stride);
      }
    } else {
-      int size = shape[dim];
-      size_t stride = strides[dim];
+      auto size = shape[dim];
+      auto stride = strides[dim];
      for (int i = 0; i < size; i++) {
        callback(offset + i * stride);
      }
@@ -190,46 +198,122 @@ void nd_loop(
 void Reduce::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
-  switch (in.dtype()) {
-    case bool_:
-      reduce_dispatch_out<bool>(in, out, reduce_type_, axes_);
+  // Dispatch on the reduction kind first, then on the input dtype.
+  switch (reduce_type_) {
+    case Reduce::And:
+    case Reduce::Or: {
+      switch (in.dtype()) {
+        case bool_:
+        case uint8:
+        case int8:
+          reduce_dispatch_and_or<int8_t>(in, out, reduce_type_, axes_);
+          break;
+        case int16:
+        case uint16:
+        case float16:
+        case bfloat16:
+          reduce_dispatch_and_or<int16_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint32:
+        case int32:
+        case float32:
+          reduce_dispatch_and_or<int32_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint64:
+        case int64:
+        case complex64:
+          reduce_dispatch_and_or<int64_t>(in, out, reduce_type_, axes_);
+          break;
+      }
      break;
-    case uint8:
-      reduce_dispatch_out<uint8_t>(in, out, reduce_type_, axes_);
+    }
+    case Reduce::Sum:
+    case Reduce::Prod: {
+      switch (in.dtype()) {
+        // Unsigned inputs keep an unsigned element type so they are
+        // zero-extended (not sign-extended) into the wider accumulator.
+        case bool_:
+        case uint8:
+          reduce_dispatch_sum_prod<uint8_t>(in, out, reduce_type_, axes_);
+          break;
+        case int8:
+          reduce_dispatch_sum_prod<int8_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint16:
+          reduce_dispatch_sum_prod<uint16_t>(in, out, reduce_type_, axes_);
+          break;
+        case int16:
+          reduce_dispatch_sum_prod<int16_t>(in, out, reduce_type_, axes_);
+          break;
+        case int32:
+        case uint32:
+          reduce_dispatch_sum_prod<int32_t>(in, out, reduce_type_, axes_);
+          break;
+        case int64:
+        case uint64:
+          reduce_dispatch_sum_prod<int64_t>(in, out, reduce_type_, axes_);
+          break;
+        case float16:
+          reduce_dispatch_sum_prod<float16_t>(in, out, reduce_type_, axes_);
+          break;
+        case bfloat16:
+          reduce_dispatch_sum_prod<bfloat16_t>(in, out, reduce_type_, axes_);
+          break;
+        case float32:
+          reduce_dispatch_sum_prod<float>(in, out, reduce_type_, axes_);
+          break;
+        case complex64:
+          reduce_dispatch_sum_prod<complex64_t>(in, out, reduce_type_, axes_);
+          break;
+      }
      break;
-    case uint16:
-      reduce_dispatch_out<uint16_t>(in, out, reduce_type_, axes_);
-      break;
-    case uint32:
-      reduce_dispatch_out<uint32_t>(in, out, reduce_type_, axes_);
-      break;
-    case uint64:
-      reduce_dispatch_out<uint64_t>(in, out, reduce_type_, axes_);
-      break;
-    case int8:
-      reduce_dispatch_out<uint8_t>(in, out, reduce_type_, axes_);
-      break;
-    case int16:
-      reduce_dispatch_out<uint16_t>(in, out, reduce_type_, axes_);
-      break;
-    case int32:
-      reduce_dispatch_out<int32_t>(in, out, reduce_type_, axes_);
-      break;
-    case int64:
-      reduce_dispatch_out<int64_t>(in, out, reduce_type_, axes_);
-      break;
-    case float16:
-      reduce_dispatch_out<float16_t>(in, out, reduce_type_, axes_);
-      break;
-    case float32:
-      reduce_dispatch_out<float>(in, out, reduce_type_, axes_);
-      break;
-    case bfloat16:
-      reduce_dispatch_out<bfloat16_t>(in, out, reduce_type_, axes_);
-      break;
-    case complex64:
-      reduce_dispatch_out<complex64_t>(in, out, reduce_type_, axes_);
+    }
+    case Reduce::Max:
+    case Reduce::Min: {
+      switch (in.dtype()) {
+        case bool_:
+          reduce_dispatch_min_max<bool>(in, out, reduce_type_, axes_);
+          break;
+        case uint8:
+          reduce_dispatch_min_max<uint8_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint16:
+          reduce_dispatch_min_max<uint16_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint32:
+          reduce_dispatch_min_max<uint32_t>(in, out, reduce_type_, axes_);
+          break;
+        case uint64:
+          reduce_dispatch_min_max<uint64_t>(in, out, reduce_type_, axes_);
+          break;
+        // Signed inputs use signed element types so negatives order correctly.
+        case int8:
+          reduce_dispatch_min_max<int8_t>(in, out, reduce_type_, axes_);
+          break;
+        case int16:
+          reduce_dispatch_min_max<int16_t>(in, out, reduce_type_, axes_);
+          break;
+        case int32:
+          reduce_dispatch_min_max<int32_t>(in, out, reduce_type_, axes_);
+          break;
+        case int64:
+          reduce_dispatch_min_max<int64_t>(in, out, reduce_type_, axes_);
+          break;
+        case float16:
+          reduce_dispatch_min_max<float16_t>(in, out, reduce_type_, axes_);
+          break;
+        case float32:
+          reduce_dispatch_min_max<float>(in, out, reduce_type_, axes_);
+          break;
+        case bfloat16:
+          reduce_dispatch_min_max<bfloat16_t>(in, out, reduce_type_, axes_);
+          break;
+        case complex64:
+          reduce_dispatch_min_max<complex64_t>(in, out, reduce_type_, axes_);
+          break;
+      }
      break;
+    }
  }
 }

--- a/mlx/backend/common/reduce.h
+++ b/mlx/backend/common/reduce.h
@@ -38,13 +38,10 @@ enum ReductionOpType {

 struct ReductionPlan {
  ReductionOpType type;
-  std::vector<int> shape;
-  std::vector<size_t> strides;
+  Shape shape; // reduction extents (reduction_op uses shape[0] as a size)
+  Strides strides; // NOTE(review): presumably one stride per shape entry

-  ReductionPlan(
-      ReductionOpType type_,
-      std::vector<int> shape_,
-      std::vector<size_t> strides_)
+  ReductionPlan(ReductionOpType type_, Shape shape_, Strides strides_)
      : type(type_), shape(std::move(shape_)), strides(std::move(strides_)) {}
  ReductionPlan(ReductionOpType type_) : type(type_) {}
 };
@@ -55,10 +52,10 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);
 // Should this be in utils?
 void nd_loop(
    std::function<void(int)> callback,
-    const std::vector<int>& shape,
-    const std::vector<size_t>& strides);
+    const Shape& shape,
+    const Strides& strides);

-std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
+std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes);

@@ -113,9 +110,6 @@ void reduction_op(
    return;
  }

-  std::vector<int> shape;
-  std::vector<size_t> strides;
-
  if (plan.type == ContiguousReduce && plan.shape.size() == 1) {
    int reduction_size = plan.shape[0];
    const T* x_ptr = x.data<T>();
@@ -135,7 +129,7 @@ void reduction_op(
    U* out_ptr = out.data<U>();
    // Unrolling the following loop (and implementing it in order for
    // ContiguousReduce) should hold extra performance boost.
-    std::tie(shape, strides) = shapes_without_reduction_axes(x, axes);
+    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
    if (plan.shape.size() == 0) {
      for (int i = 0; i < out.size(); i++, out_ptr++) {
        int offset = elem_to_loc(i, shape, strides);
@@ -181,7 +175,7 @@ void reduction_op(
    plan.strides.pop_back();
    const T* x_ptr = x.data<T>();
    U* out_ptr = out.data<U>();
-    std::tie(shape, strides) = shapes_without_reduction_axes(x, axes);
+    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
    if (plan.shape.size() == 0) {
      for (int i = 0; i < out.size(); i += reduction_stride) {
        int offset = elem_to_loc(i, shape, strides);
@@ -211,7 +205,7 @@ void reduction_op(
  if (plan.type == GeneralReduce) {
    const T* x_ptr = x.data<T>();
    U* out_ptr = out.data<U>();
-    std::tie(shape, strides) = shapes_without_reduction_axes(x, axes);
+    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
    for (int i = 0; i < out.size(); i++, out_ptr++) {
      int offset = elem_to_loc(i, shape, strides);
      U val = init;
--- a/mlx/backend/common/reduce_utils.cpp
+++ b/mlx/backend/common/reduce_utils.cpp
@@ -4,11 +4,11 @@

 namespace mlx::core {

-std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
+std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes) {
-  std::vector<int> shape = x.shape();
-  std::vector<size_t> strides = x.strides();
+  auto shape = x.shape();
+  auto strides = x.strides();

  for (int i = axes.size() - 1; i >= 0; i--) {
    int a = axes[i];
@@ -29,8 +29,8 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // Row contiguous input so the output is row contiguous
  if (x.flags().row_contiguous) {
    // Merge consecutive axes
-    std::vector<int> shape = {x.shape(axes[0])};
-    std::vector<size_t> strides = {x.strides()[axes[0]]};
+    Shape shape = {x.shape(axes[0])};
+    Strides strides = {x.strides()[axes[0]]};
    for (int i = 1; i < axes.size(); i++) {
      if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
        shape.back() *= x.shape(axes[i]);
@@ -69,7 +69,7 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {

  // Sort reduction axes by stride in order to merge them and figure out if we
  // have a contiguous reduction.
-  std::vector<std::pair<int, size_t>> reductions;
+  std::vector<std::pair<int, int64_t>> reductions;
  for (auto a : axes) {
    if (x.shape(a) > 1) {
      reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
@@ -93,8 +93,8 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
    }
  }

-  std::vector<int> shape;
-  std::vector<size_t> strides;
+  Shape shape;
+  Strides strides;
  for (auto r : reductions) {
    shape.push_back(r.first);
    strides.push_back(r.second);
@@ -109,15 +109,15 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // Delegate to the general strided reduction op if the axes after
  // strides.back() are contiguous.
  if (strides.back() > 1) {
-    int size = 1;
+    int64_t size = 1;
    bool have_expand = false;
    for (int i = x.ndim() - 1; i >= 0; i--) {
      if (axes.back() == i) {
        continue;
      }

-      size_t stride_i = x.strides()[i];
-      int shape_i = x.shape(i);
+      auto stride_i = x.strides()[i];
+      auto shape_i = x.shape(i);
      if (stride_i == 0) {
        if (shape_i == 1) {
          continue;
--- a/mlx/backend/common/slicing.cpp
+++ b/mlx/backend/common/slicing.cpp
@@ -4,24 +4,22 @@

 namespace mlx::core {

-std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
+std::tuple<int64_t, Strides> prepare_slice(
    const array& in,
-    const std::vector<int>& start_indices,
-    const std::vector<int>& strides) {
+    const Shape& start_indices, // per-axis index, scaled by in's strides
+    const Shape& strides) { // per-axis step, multiplied into in's strides
  int64_t data_offset = 0;
-  bool copy_needed = false;
-  std::vector<int64_t> inp_strides(in.ndim(), 0);
+  Strides inp_strides(in.ndim(), 0); // one entry per axis of in
  for (int i = 0; i < in.ndim(); ++i) {
    data_offset += start_indices[i] * in.strides()[i];
    inp_strides[i] = in.strides()[i] * strides[i];
-    copy_needed |= strides[i] < 0;
  }
-  return std::make_tuple(copy_needed, data_offset, inp_strides);
+  return std::make_tuple(data_offset, inp_strides); // {offset, strides}
 }

 void shared_buffer_slice(
    const array& in,
-    const std::vector<size_t>& out_strides,
+    const Strides& out_strides,
    size_t data_offset,
    size_t data_size,
    array& out) {
@@ -34,7 +32,7 @@ void shared_buffer_slice(
  flags.col_contiguous = is_col_contiguous;
  flags.contiguous = (no_bsx_size == data_size);

-  out.copy_shared_buffer(in, out_strides, flags, data_size, data_offset);
+  move_or_copy(in, out, out_strides, flags, data_size, data_offset);
 }

 } // namespace mlx::core
--- a/mlx/backend/common/slicing.h
+++ b/mlx/backend/common/slicing.h
@@ -6,14 +6,14 @@

 namespace mlx::core {

-std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
+std::tuple<int64_t, Strides> prepare_slice(
    const array& in,
-    const std::vector<int>& start_indices,
-    const std::vector<int>& strides);
+    const Shape& start_indices,
+    const Shape& strides);

 void shared_buffer_slice(
    const array& in,
-    const std::vector<size_t>& out_strides,
+    const Strides& out_strides,
    size_t data_offset,
    size_t data_size,
    array& out);
--- a/mlx/backend/common/sort.cpp
+++ b/mlx/backend/common/sort.cpp
@@ -25,7 +25,7 @@ struct StridedIterator {
  // Constructors
  StridedIterator() = default;

-  explicit StridedIterator(T* ptr, size_t stride, difference_type offset = 0)
+  explicit StridedIterator(T* ptr, int64_t stride, difference_type offset = 0)
      : ptr_(ptr + offset * stride), stride_(stride) {}

  explicit StridedIterator(array& arr, int axis, difference_type offset = 0)
@@ -99,7 +99,7 @@ struct StridedIterator {
  }

 private:
-  size_t stride_;
+  int64_t stride_;
  T* ptr_;
 };

@@ -111,7 +111,8 @@ void sort(const array& in, array& out, int axis) {

  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
-  size_t n_rows = in.size() / in.shape(axis);
+  size_t in_size = in.flags().contiguous ? in.data_size() : in.size();
+  size_t n_rows = in_size / in.shape(axis);

  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);
@@ -119,18 +120,20 @@ void sort(const array& in, array& out, int axis) {
  auto remaining_strides = out.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

-  size_t axis_stride = out.strides()[axis];
-  int axis_size = out.shape(axis);
+  auto axis_stride = out.strides()[axis];
+  auto axis_size = out.shape(axis);

  // Perform sorting in place
+  ContiguousIterator src_it(
+      remaining_shape, remaining_strides, remaining_shape.size());
  for (int i = 0; i < n_rows; i++) {
-    size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
-    T* data_ptr = out.data<T>() + loc;
+    T* data_ptr = out.data<T>() + src_it.loc;

    StridedIterator st(data_ptr, axis_stride, 0);
    StridedIterator ed(data_ptr, axis_stride, axis_size);

    std::stable_sort(st, ed);
+    src_it.step();
  }
 }

@@ -155,16 +158,20 @@ void argsort(const array& in, array& out, int axis) {
  auto out_remaining_strides = out.strides();
  out_remaining_strides.erase(out_remaining_strides.begin() + axis);

-  size_t in_stride = in.strides()[axis];
-  size_t out_stride = out.strides()[axis];
-  int axis_size = in.shape(axis);
+  auto in_stride = in.strides()[axis];
+  auto out_stride = out.strides()[axis];
+  auto axis_size = in.shape(axis);

  // Perform sorting
+  ContiguousIterator in_it(
+      in_remaining_shape, in_remaining_strides, in_remaining_shape.size());
+  ContiguousIterator out_it(
+      out_remaining_shape, out_remaining_strides, out_remaining_shape.size());
  for (int i = 0; i < n_rows; i++) {
-    size_t in_loc = elem_to_loc(i, in_remaining_shape, in_remaining_strides);
-    size_t out_loc = elem_to_loc(i, out_remaining_shape, out_remaining_strides);
-    const T* data_ptr = in.data<T>() + in_loc;
-    IdxT* idx_ptr = out.data<IdxT>() + out_loc;
+    const T* data_ptr = in.data<T>() + in_it.loc;
+    IdxT* idx_ptr = out.data<IdxT>() + out_it.loc;
+    in_it.step();
+    out_it.step();

    StridedIterator st_(idx_ptr, out_stride, 0);
    StridedIterator ed_(idx_ptr, out_stride, axis_size);
@@ -192,7 +199,8 @@ void partition(const array& in, array& out, int axis, int kth) {

  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
-  size_t n_rows = in.size() / in.shape(axis);
+  size_t in_size = in.flags().contiguous ? in.data_size() : in.size();
+  size_t n_rows = in_size / in.shape(axis);

  auto remaining_shape = in.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);
@@ -200,15 +208,17 @@ void partition(const array& in, array& out, int axis, int kth) {
  auto remaining_strides = in.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

-  size_t axis_stride = in.strides()[axis];
+  auto axis_stride = in.strides()[axis];
  int axis_size = in.shape(axis);

  kth = kth < 0 ? kth + axis_size : kth;

  // Perform partition in place
+  ContiguousIterator src_it(
+      remaining_shape, remaining_strides, remaining_shape.size());
  for (int i = 0; i < n_rows; i++) {
-    size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
-    T* data_ptr = out.data<T>() + loc;
+    T* data_ptr = out.data<T>() + src_it.loc;
+    src_it.step();

    StridedIterator st(data_ptr, axis_stride, 0);
    StridedIterator md(data_ptr, axis_stride, kth);
@@ -227,37 +237,49 @@ void argpartition(const array& in, array& out, int axis, int kth) {
  axis = axis < 0 ? axis + in.ndim() : axis;
  size_t n_rows = in.size() / in.shape(axis);

-  auto remaining_shape = in.shape();
-  remaining_shape.erase(remaining_shape.begin() + axis);
+  auto in_remaining_shape = in.shape();
+  in_remaining_shape.erase(in_remaining_shape.begin() + axis);

-  auto remaining_strides = in.strides();
-  remaining_strides.erase(remaining_strides.begin() + axis);
+  auto in_remaining_strides = in.strides();
+  in_remaining_strides.erase(in_remaining_strides.begin() + axis);

-  size_t axis_stride = in.strides()[axis];
-  int axis_size = in.shape(axis);
+  auto out_remaining_shape = out.shape();
+  out_remaining_shape.erase(out_remaining_shape.begin() + axis);
+
+  auto out_remaining_strides = out.strides();
+  out_remaining_strides.erase(out_remaining_strides.begin() + axis);
+
+  auto in_stride = in.strides()[axis];
+  auto out_stride = out.strides()[axis];
+  auto axis_size = in.shape(axis);

  kth = kth < 0 ? kth + axis_size : kth;

  // Perform partition
+  ContiguousIterator in_it(
+      in_remaining_shape, in_remaining_strides, in_remaining_shape.size());
+  ContiguousIterator out_it(
+      out_remaining_shape, out_remaining_strides, out_remaining_shape.size());
  for (int i = 0; i < n_rows; i++) {
-    size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
-    const T* data_ptr = in.data<T>() + loc;
-    IdxT* idx_ptr = out.data<IdxT>() + loc;
+    const T* data_ptr = in.data<T>() + in_it.loc;
+    IdxT* idx_ptr = out.data<IdxT>() + out_it.loc;
+    in_it.step();
+    out_it.step();

-    StridedIterator st_(idx_ptr, axis_stride, 0);
-    StridedIterator ed_(idx_ptr, axis_stride, axis_size);
+    StridedIterator st_(idx_ptr, out_stride, 0);
+    StridedIterator ed_(idx_ptr, out_stride, axis_size);

    // Initialize with iota
    std::iota(st_, ed_, IdxT(0));

    // Sort according to vals
-    StridedIterator st(idx_ptr, axis_stride, 0);
-    StridedIterator md(idx_ptr, axis_stride, kth);
-    StridedIterator ed(idx_ptr, axis_stride, axis_size);
+    StridedIterator st(idx_ptr, out_stride, 0);
+    StridedIterator md(idx_ptr, out_stride, kth);
+    StridedIterator ed(idx_ptr, out_stride, axis_size);

-    std::nth_element(st, md, ed, [data_ptr, axis_stride](IdxT a, IdxT b) {
-      auto v1 = data_ptr[a * axis_stride];
-      auto v2 = data_ptr[b * axis_stride];
+    std::nth_element(st, md, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
+      auto v1 = data_ptr[a * in_stride];
+      auto v2 = data_ptr[b * in_stride];
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
--- a/mlx/backend/common/svd.cpp
+++ b/mlx/backend/common/svd.cpp
@@ -2,7 +2,7 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/common/copy.h"
-#include "mlx/backend/common/lapack_helper.h"
+#include "mlx/backend/common/lapack.h"
 #include "mlx/primitives.h"

 namespace mlx::core {
--- a/mlx/backend/common/ternary.h
+++ b/mlx/backend/common/ternary.h
@@ -71,128 +71,46 @@ void set_ternary_op_output_data(
      break;
  }
 }
+template <typename T1, typename T2, typename T3, typename U, typename Op, int D>
+void ternary_op_dims(
+    const T1* a,
+    const T2* b,
+    const T3* c,
+    U* out,
+    Op op,
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& c_strides,
+    const Strides& out_strides,
+    int axis) {
+  auto stride_a = a_strides[axis];
+  auto stride_b = b_strides[axis];
+  auto stride_c = c_strides[axis];
+  auto stride_out = out_strides[axis];
+  auto N = shape[axis];

-template <typename T1, typename T2, typename T3, typename U, typename Op>
-void ternary_op_dims1(
-    const array& a,
-    const array& b,
-    const array& c,
-    array& out,
-    Op op) {
-  const T1* a_ptr = a.data<T1>();
-  const T2* b_ptr = b.data<T2>();
-  const T3* c_ptr = c.data<T3>();
-
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t c_idx = 0;
-  for (size_t i = 0; i < out.size(); ++i) {
-    dst[i] = op(a_ptr[a_idx], b_ptr[b_idx], c_ptr[c_idx]);
-    a_idx += a.strides()[0];
-    b_idx += b.strides()[0];
-    c_idx += c.strides()[0];
-  }
-}
-
-template <typename T1, typename T2, typename T3, typename U, typename Op>
-void ternary_op_dims2(
-    const array& a,
-    const array& b,
-    const array& c,
-    array& out,
-    Op op) {
-  const T1* a_ptr = a.data<T1>();
-  const T2* b_ptr = b.data<T2>();
-  const T3* c_ptr = c.data<T3>();
-
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t c_idx = 0;
-  size_t out_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      dst[out_idx++] = op(a_ptr[a_idx], b_ptr[b_idx], c_ptr[c_idx]);
-      a_idx += a.strides()[1];
-      b_idx += b.strides()[1];
-      c_idx += c.strides()[1];
+  for (int i = 0; i < N; i++) {
+    if constexpr (D > 1) {
+      ternary_op_dims<T1, T2, T3, U, Op, D - 1>(
+          a,
+          b,
+          c,
+          out,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          c_strides,
+          out_strides,
+          axis + 1);
+    } else {
+      *out = op(*a, *b, *c);
    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
-    c_idx += c.strides()[0] - c.strides()[1] * c.shape()[1];
-  }
-}
-
-template <typename T1, typename T2, typename T3, typename U, typename Op>
-void ternary_op_dims3(
-    const array& a,
-    const array& b,
-    const array& c,
-    array& out,
-    Op op) {
-  const T1* a_ptr = a.data<T1>();
-  const T2* b_ptr = b.data<T2>();
-  const T3* c_ptr = c.data<T3>();
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t c_idx = 0;
-  size_t out_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      for (size_t k = 0; k < a.shape()[2]; ++k) {
-        dst[out_idx++] = op(a_ptr[a_idx], b_ptr[b_idx], c_ptr[c_idx]);
-        a_idx += a.strides()[2];
-        b_idx += b.strides()[2];
-        c_idx += c.strides()[2];
-      }
-      a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
-      b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
-      c_idx += c.strides()[1] - c.strides()[2] * c.shape()[2];
-    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
-    c_idx += c.strides()[0] - c.strides()[1] * c.shape()[1];
-  }
-}
-
-template <typename T1, typename T2, typename T3, typename U, typename Op>
-void ternary_op_dims4(
-    const array& a,
-    const array& b,
-    const array& c,
-    array& out,
-    Op op) {
-  const T1* a_ptr = a.data<T1>();
-  const T2* b_ptr = b.data<T2>();
-  const T3* c_ptr = c.data<T3>();
-
-  U* dst = out.data<U>();
-  size_t a_idx = 0;
-  size_t b_idx = 0;
-  size_t c_idx = 0;
-  size_t out_idx = 0;
-  for (size_t i = 0; i < a.shape()[0]; ++i) {
-    for (size_t j = 0; j < a.shape()[1]; ++j) {
-      for (size_t k = 0; k < a.shape()[2]; ++k) {
-        for (size_t ii = 0; ii < a.shape()[3]; ++ii) {
-          dst[out_idx++] = op(a_ptr[a_idx], b_ptr[b_idx], c_ptr[c_idx]);
-          a_idx += a.strides()[3];
-          b_idx += b.strides()[3];
-          c_idx += c.strides()[3];
-        }
-        a_idx += a.strides()[2] - a.strides()[3] * a.shape()[3];
-        b_idx += b.strides()[2] - b.strides()[3] * b.shape()[3];
-        c_idx += c.strides()[2] - c.strides()[3] * c.shape()[3];
-      }
-      a_idx += a.strides()[1] - a.strides()[2] * a.shape()[2];
-      b_idx += b.strides()[1] - b.strides()[2] * b.shape()[2];
-      c_idx += c.strides()[1] - c.strides()[2] * c.shape()[2];
-    }
-    a_idx += a.strides()[0] - a.strides()[1] * a.shape()[1];
-    b_idx += b.strides()[0] - b.strides()[1] * b.shape()[1];
-    c_idx += c.strides()[0] - c.strides()[1] * c.shape()[1];
+    a += stride_a;
+    b += stride_b;
+    c += stride_c;
+    out += stride_out;
  }
 }

@@ -203,30 +121,69 @@ void ternary_op_dispatch_dims(
    const array& c,
    array& out,
    Op op) {
-  switch (out.ndim()) {
-    case 1:
-      ternary_op_dims1<T1, T2, T3, U, Op>(a, b, c, out, op);
-      return;
-    case 2:
-      ternary_op_dims2<T1, T2, T3, U, Op>(a, b, c, out, op);
-      return;
-    case 3:
-      ternary_op_dims3<T1, T2, T3, U, Op>(a, b, c, out, op);
-      return;
-    case 4:
-      ternary_op_dims4<T1, T2, T3, U, Op>(a, b, c, out, op);
-      return;
-  }
+  auto [shape, strides] = collapse_contiguous_dims(
+      a.shape(), {a.strides(), b.strides(), c.strides(), out.strides()});
+  const auto& a_strides = strides[0];
+  const auto& b_strides = strides[1];
+  const auto& c_strides = strides[2];
+  const auto& out_strides = strides[3];

  const T1* a_ptr = a.data<T1>();
  const T2* b_ptr = b.data<T2>();
  const T3* c_ptr = c.data<T3>();
-  U* dst = out.data<U>();
-  for (size_t i = 0; i < out.size(); i++) {
-    int a_idx = elem_to_loc(i, a.shape(), a.strides());
-    int b_idx = elem_to_loc(i, b.shape(), b.strides());
-    int c_idx = elem_to_loc(i, c.shape(), c.strides());
-    dst[i] = op(a_ptr[a_idx], b_ptr[b_idx], c_ptr[c_idx]);
+  U* out_ptr = out.data<U>(); // out holds U elements; T3 is c's type
+  int ndim = shape.size();
+  switch (ndim) {
+    case 1:
+      ternary_op_dims<T1, T2, T3, U, Op, 1>(
+          a_ptr,
+          b_ptr,
+          c_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          c_strides,
+          out_strides,
+          0);
+      return;
+    case 2:
+      ternary_op_dims<T1, T2, T3, U, Op, 2>(
+          a_ptr,
+          b_ptr,
+          c_ptr,
+          out_ptr,
+          op,
+          shape,
+          a_strides,
+          b_strides,
+          c_strides,
+          out_strides,
+          0);
+      return;
+  }
+
+  ContiguousIterator a_it(shape, a_strides, ndim - 2);
+  ContiguousIterator b_it(shape, b_strides, ndim - 2);
+  ContiguousIterator c_it(shape, c_strides, ndim - 2);
+  auto stride = out_strides[ndim - 3];
+  for (size_t elem = 0; elem < a.size(); elem += stride) {
+    ternary_op_dims<T1, T2, T3, U, Op, 2>(
+        a_ptr + a_it.loc,
+        b_ptr + b_it.loc,
+        c_ptr + c_it.loc,
+        out_ptr + elem,
+        op,
+        shape,
+        a_strides,
+        b_strides,
+        c_strides,
+        out_strides,
+        ndim - 2);
+    a_it.step();
+    b_it.step();
+    c_it.step();
  }
 }

@@ -243,10 +200,21 @@ void ternary_op(
  // The full computation is scalar-scalar-scalar so we call the base op once.
  if (topt == TernaryOpType::ScalarScalarScalar) {
    *(out.data<U>()) = op(*a.data<T1>(), *b.data<T2>(), *c.data<T3>());
-    return;
+  } else if (topt == TernaryOpType::VectorVectorVector) {
+    const T1* a_ptr = a.data<T1>();
+    const T2* b_ptr = b.data<T2>();
+    const T3* c_ptr = c.data<T3>();
+    U* out_ptr = out.data<U>();
+    for (size_t i = 0; i < out.size(); ++i) {
+      *out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
+      a_ptr++;
+      b_ptr++;
+      c_ptr++;
+      out_ptr++;
+    }
+  } else {
+    ternary_op_dispatch_dims<T1, T2, T3, U>(a, b, c, out, op);
  }
-
-  ternary_op_dispatch_dims<T1, T2, T3, U>(a, b, c, out, op);
 }

 } // namespace
--- a/mlx/backend/common/unary.h
+++ b/mlx/backend/common/unary.h
@@ -24,22 +24,36 @@ void set_unary_output_data(const array& in, array& out) {
  }
 }

-template <typename T, typename Op>
+template <typename T, typename U = T, typename Op>
+void unary_op(const T* a, U* out, Op op, size_t shape, int64_t stride) {
+  for (size_t i = 0; i < shape; i += 1) {
+    out[i] = op(*a);
+    a += stride; // signed step: a negative stride walks a backwards
+  }
+}
+
+template <typename T, typename U = T, typename Op>
 void unary_op(const array& a, array& out, Op op) {
  const T* a_ptr = a.data<T>();
  if (a.flags().contiguous) {
    set_unary_output_data(a, out);
-    T* dst = out.data<T>();
+    U* dst = out.data<U>();
    for (size_t i = 0; i < a.data_size(); ++i) {
      dst[i] = op(a_ptr[i]);
    }
  } else {
    out.set_data(allocator::malloc_or_wait(out.nbytes()));
-    T* dst = out.data<T>();
-    for (size_t i = 0; i < out.size(); ++i) {
-      // TODO this is super inefficient, need to fix.
-      int a_idx = elem_to_loc(i, a.shape(), a.strides());
-      dst[i] = op(a_ptr[a_idx]);
+    U* dst = out.data<U>();
+    size_t shape = a.ndim() > 0 ? a.shape(-1) : 1;
+    int64_t stride = a.ndim() > 0 ? a.strides(-1) : 1; // keep the sign
+    if (a.ndim() <= 1) {
+      unary_op(a_ptr, dst, op, shape, stride);
+      return;
+    }
+    ContiguousIterator it(a.shape(), a.strides(), a.ndim() - 1);
+    for (size_t elem = 0; elem < a.size(); elem += shape) {
+      unary_op(a_ptr + it.loc, dst + elem, op, shape, stride);
+      it.step();
    }
  }
 }
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -0,0 +1,126 @@
+// Copyright © 2023-2024 Apple Inc.
+
+#include "mlx/backend/common/utils.h"
+
+namespace mlx::core {
+
+void move_or_copy(const array& in, array& out) {
+  if (!in.is_donatable()) { // Non-donatable inputs take the copy path.
+    out.copy_shared_buffer(in);
+    return;
+  }
+  out.move_shared_buffer(in); // Donatable inputs take the move path.
+}
+
+void move_or_copy(
+    const array& in,
+    array& out,
+    const Strides& strides,
+    array::Flags flags,
+    size_t data_size,
+    size_t offset /* = 0 */) {
+  if (!in.is_donatable()) { // Non-donatable inputs take the copy path.
+    out.copy_shared_buffer(in, strides, flags, data_size, offset);
+    return;
+  }
+  out.move_shared_buffer(in, strides, flags, data_size, offset);
+}
+
+std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
+    const Shape& shape,
+    const std::vector<Strides>& strides,
+    int64_t size_cap) {
+  // Build a list of axis indices in which -1 separates groups; every run of
+  // axes between two -1 markers is collapsed into one output axis.
+  Shape to_collapse;
+  if (shape.size() > 0) {
+    if (shape[0] != 1) { // size-1 axes are never listed
+      to_collapse.push_back(0);
+    }
+    size_t size = shape[0]; // running element count of the current group
+    for (int i = 1; i < shape.size(); i++) {
+      bool contiguous = true;
+      size *= shape[i];
+      for (const auto& st : strides) { // every stride set must agree
+        if (st[i] * shape[i] != st[i - 1] || size > size_cap) {
+          contiguous = false;
+          size = shape[i]; // axis i starts a new group
+          break;
+        }
+      }
+      if (!contiguous) {
+        to_collapse.push_back(-1); // close the previous group
+      }
+      if (shape[i] != 1) {
+        to_collapse.push_back(i);
+      }
+    }
+    to_collapse.push_back(-1); // terminate the last group
+  }
+
+  Shape out_shape;
+  std::vector<Strides> out_strides(strides.size());
+  for (int i = 0;;) { // i indexes to_collapse; the loop exits via break
+    while (i < to_collapse.size() && to_collapse[i] == -1) { // skip markers
+      ++i;
+    };
+    if (i == to_collapse.size()) {
+      break;
+    }
+    int current_shape = shape[to_collapse[i]];
+    int k = i;
+    while (to_collapse[++k] != -1) { // multiply out the group's extents
+      current_shape *= shape[to_collapse[k]];
+    }
+    out_shape.push_back(current_shape);
+    for (int j = 0; j < strides.size(); j++) {
+      const auto& st = strides[j];
+      out_strides[j].push_back(st[to_collapse[k - 1]]); // group's last axis
+    }
+    i = k + 1; // resume after this group's marker
+  }
+
+  if (!shape.empty() && out_shape.empty()) { // every axis had size 1
+    out_shape.push_back(1);
+    for (auto& out_stride : out_strides) {
+      out_stride.push_back(0);
+    }
+  }
+  return std::make_tuple(out_shape, out_strides);
+}
+
+std::pair<Shape, Strides> collapse_contiguous_dims(
+    const Shape& shape,
+    const Strides& strides,
+    int64_t size_cap) {
+  Shape dims;
+  Strides steps;
+  if (shape.empty()) {
+    return std::make_pair(dims, steps);
+  }
+  dims.push_back(shape[0]); // The first axis always seeds a group.
+  steps.push_back(strides[0]);
+  for (int i = 1; i < shape.size(); i++) {
+    if (shape[i] == 1) {
+      continue; // Later size-1 axes are dropped.
+    }
+    const bool dense = strides[i] * shape[i] == steps.back();
+    const bool fits = dims.back() * static_cast<int64_t>(shape[i]) <= size_cap;
+    if (dense && fits) { // Grow the current group in place.
+      dims.back() *= shape[i];
+      steps.back() = strides[i];
+    } else { // Otherwise axis i opens a new group.
+      dims.push_back(shape[i]);
+      steps.push_back(strides[i]);
+    }
+  }
+  return std::make_pair(dims, steps);
+}
+
+std::pair<Shape, Strides> collapse_contiguous_dims(
+    const array& a, // Only a.shape() and a.strides() are read.
+    int64_t size_cap /* = std::numeric_limits<int32_t>::max()*/) {
+  return collapse_contiguous_dims(a.shape(), a.strides(), size_cap);
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -8,12 +8,9 @@

 namespace mlx::core {

-template <typename stride_t>
-inline stride_t elem_to_loc(
-    int elem,
-    const std::vector<int>& shape,
-    const std::vector<stride_t>& strides) {
-  stride_t loc = 0;
+inline int64_t
+elem_to_loc(int elem, const Shape& shape, const Strides& strides) {
+  int64_t loc = 0;
  for (int i = shape.size() - 1; i >= 0; --i) {
    auto q_and_r = ldiv(elem, shape[i]);
    loc += q_and_r.rem * strides[i];
@@ -22,16 +19,15 @@ inline stride_t elem_to_loc(
  return loc;
 }

-inline size_t elem_to_loc(int elem, const array& a) {
+inline int64_t elem_to_loc(int elem, const array& a) {
  if (a.flags().row_contiguous) {
    return elem;
  }
  return elem_to_loc(elem, a.shape(), a.strides());
 }

-template <typename stride_t>
-std::vector<stride_t> make_contiguous_strides(const std::vector<int>& shape) {
-  std::vector<stride_t> strides(shape.size(), 1);
+inline Strides make_contiguous_strides(const Shape& shape) {
+  Strides strides(shape.size(), 1);
  for (int i = shape.size() - 1; i > 0; i--) {
    strides[i - 1] = strides[i] * shape[i];
  }
@@ -44,58 +40,19 @@ std::vector<stride_t> make_contiguous_strides(const std::vector<int>& shape) {
 //
 // When multiple arrays are passed they should all have the same shape. The
 // collapsed axes are also the same so one shape is returned.
-template <typename stride_t>
-inline std::tuple<std::vector<int>, std::vector<std::vector<stride_t>>>
-collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<std::vector<stride_t>> strides) {
-  // Make a vector that has axes separated with -1. Collapse all axes between
-  // -1.
-  std::vector<int> to_collapse;
-  if (shape.size() > 0) {
-    to_collapse.push_back(0);
-    for (int i = 1; i < shape.size(); i++) {
-      bool contiguous = true;
-      for (const std::vector<stride_t>& st : strides) {
-        if (st[i] * shape[i] != st[i - 1]) {
-          contiguous = false;
-        }
-        if (!contiguous) {
-          break;
-        }
-      }
-      if (!contiguous) {
-        to_collapse.push_back(-1);
-      }
-      to_collapse.push_back(i);
-    }
-    to_collapse.push_back(-1);
-  }
+std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
+    const Shape& shape,
+    const std::vector<Strides>& strides,
+    int64_t size_cap = std::numeric_limits<int32_t>::max());

-  std::vector<int> out_shape;
-  std::vector<std::vector<stride_t>> out_strides(strides.size());
-  for (int i = 0; i < to_collapse.size(); i++) {
-    int current_shape = shape[to_collapse[i]];
-    while (to_collapse[++i] != -1) {
-      current_shape *= shape[to_collapse[i]];
-    }
-    out_shape.push_back(current_shape);
-    for (int j = 0; j < strides.size(); j++) {
-      const std::vector<stride_t>& st = strides[j];
-      out_strides[j].push_back(st[to_collapse[i - 1]]);
-    }
-  }
-
-  return std::make_tuple(out_shape, out_strides);
-}
-
-inline std::tuple<std::vector<int>, std::vector<std::vector<size_t>>>
-collapse_contiguous_dims(const std::vector<array>& xs) {
-  std::vector<std::vector<size_t>> strides;
+inline std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
+    const std::vector<array>& xs, // non-empty; xs[0] supplies the shape
+    int64_t size_cap = std::numeric_limits<int32_t>::max()) {
+  std::vector<Strides> strides;
  for (auto& x : xs) {
    strides.emplace_back(x.strides());
  }
-  return collapse_contiguous_dims(xs[0].shape(), strides);
+  return collapse_contiguous_dims(xs[0].shape(), strides, size_cap);
 }

 template <typename... Arrays, typename = enable_for_arrays_t<Arrays...>>
@@ -105,39 +62,79 @@ inline auto collapse_contiguous_dims(Arrays&&... xs) {
 }

 // The single array version of the above.
-inline std::tuple<std::vector<int>, std::vector<size_t>>
-collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<size_t>& strides) {
-  std::vector<int> collapsed_shape;
-  std::vector<size_t> collapsed_strides;
+std::pair<Shape, Strides> collapse_contiguous_dims(
+    const Shape& shape,
+    const Strides& strides,
+    int64_t size_cap = std::numeric_limits<int32_t>::max());
+std::pair<Shape, Strides> collapse_contiguous_dims(
+    const array& a,
+    int64_t size_cap = std::numeric_limits<int32_t>::max());

-  if (shape.size() > 0) {
-    collapsed_shape.push_back(shape[0]);
-    collapsed_strides.push_back(strides[0]);
-    for (int i = 1; i < shape.size(); i++) {
-      if (strides[i] * shape[i] != collapsed_strides.back() ||
-          collapsed_shape.back() * static_cast<size_t>(shape[i]) >
-              std::numeric_limits<int>::max()) {
-        collapsed_shape.push_back(shape[i]);
-        collapsed_strides.push_back(strides[i]);
-      } else {
-        collapsed_shape.back() *= shape[i];
-        collapsed_strides.back() = strides[i];
-      }
+struct ContiguousIterator { // Keeps `loc`, the strided offset of the current row-major position.
+  inline void step() { // Advance the position by one element and update `loc`.
+    int dims = shape_.size();
+    if (dims == 0) {
+      return;
+    }
+    int i = dims - 1;
+    while (pos_[i] == (shape_[i] - 1) && i > 0) { // wrap trailing axes sitting on their last index
+      pos_[i] = 0;
+      loc -= (shape_[i] - 1) * strides_[i];
+      i--;
+    }
+    pos_[i]++;
+    loc += strides_[i];
+  }
+
+  void seek(int64_t n) { // Set the position (and `loc`) from the linear row-major index n.
+    loc = 0;
+    for (int i = shape_.size() - 1; i >= 0; --i) {
+      auto q_and_r = lldiv(n, shape_[i]); // lldiv: ldiv takes long, which can be 32-bit and truncate n
+      loc += q_and_r.rem * strides_[i];
+      pos_[i] = q_and_r.rem;
+      n = q_and_r.quot;
    }
  }

-  return std::make_tuple(collapsed_shape, collapsed_strides);
-}
+  void reset() { // Back to the first element.
+    loc = 0;
+    std::fill(pos_.begin(), pos_.end(), 0);
+  }

-template <typename stride_t>
-inline auto check_contiguity(
-    const std::vector<int>& shape,
-    const std::vector<stride_t>& strides) {
+  ContiguousIterator() = default; // empty shape: step() and seek() leave loc at 0
+
+  explicit ContiguousIterator(const array& a)
+      : shape_(a.shape()), strides_(a.strides()) {
+    if (!shape_.empty()) {
+      std::tie(shape_, strides_) = collapse_contiguous_dims(shape_, strides_);
+      pos_ = Shape(shape_.size(), 0); // sized after collapsing, same as the constructor below
+    }
+  }
+
+  explicit ContiguousIterator(
+      const Shape& shape,
+      const Strides& strides,
+      int dims) // only the first `dims` axes of shape/strides are used
+      : shape_(shape.begin(), shape.begin() + dims),
+        strides_(strides.begin(), strides.begin() + dims) {
+    if (!shape_.empty()) {
+      std::tie(shape_, strides_) = collapse_contiguous_dims(shape_, strides_);
+      pos_ = Shape(shape_.size(), 0);
+    }
+  }
+
+  int64_t loc{0}; // current offset, in units of strides_
+
+ private:
+  Shape shape_;
+  Strides strides_;
+  Shape pos_; // per-axis index into shape_
+};
+
+inline auto check_contiguity(const Shape& shape, const Strides& strides) {
  size_t no_broadcast_data_size = 1;
-  size_t f_stride = 1;
-  size_t b_stride = 1;
+  int64_t f_stride = 1;
+  int64_t b_stride = 1;
  bool is_row_contiguous = true;
  bool is_col_contiguous = true;

@@ -162,4 +159,13 @@ inline bool is_donatable(const array& in, const array& out) {
      in.buffer_size() <= out.nbytes() + donation_extra;
 }

+void move_or_copy(const array& in, array& out);
+void move_or_copy(
+    const array& in,
+    array& out,
+    const Strides& strides,
+    array::Flags flags,
+    size_t data_size,
+    size_t offset = 0);
+
 } // namespace mlx::core
--- a/mlx/backend/metal/CMakeLists.txt
+++ b/mlx/backend/metal/CMakeLists.txt
@@ -1,99 +1,63 @@
 function(make_jit_source SRC_FILE)
-  # This function takes a metal header file,
-  # runs the C preprocessesor on it, and makes
-  # the processed contents available as a string in a C++ function
+  # This function takes a metal header file, runs the C preprocessor on it,
+  # and makes the processed contents available as a string in a C++ function
  # mlx::core::metal::${SRC_NAME}()
  #
-  # To use the function, declare it in jit/includes.h and
-  # include jit/includes.h.
+  # To use the function, declare it in jit/includes.h and include
+  # jit/includes.h.
  #
-  # Additional arguments to this function are treated as dependencies
-  # in the Cmake build system.
+  # Additional arguments to this function are treated as dependencies in the
+  # CMake build system.
  get_filename_component(SRC_NAME ${SRC_FILE} NAME)
  add_custom_command(
-    OUTPUT  jit/${SRC_NAME}.cpp
-    COMMAND /bin/bash
-              ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
-              ${CMAKE_CURRENT_BINARY_DIR}/jit
-              ${CMAKE_C_COMPILER}
-              ${PROJECT_SOURCE_DIR}
-              ${SRC_FILE}
-              "-DMLX_METAL_VERSION=${MLX_METAL_VERSION}"
-    DEPENDS make_compiled_preamble.sh
-            kernels/${SRC_FILE}.h
-            ${ARGN}
-  )
+    OUTPUT jit/${SRC_NAME}.cpp
+    COMMAND
+      /bin/bash ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
+      ${CMAKE_CURRENT_BINARY_DIR}/jit ${CMAKE_C_COMPILER} ${PROJECT_SOURCE_DIR}
+      ${SRC_FILE}
+    DEPENDS make_compiled_preamble.sh kernels/${SRC_FILE}.h ${ARGN})
  add_custom_target(${SRC_NAME} DEPENDS jit/${SRC_NAME}.cpp)
  add_dependencies(mlx ${SRC_NAME})
-  target_sources(
-    mlx
-    PRIVATE
-    ${CMAKE_CURRENT_BINARY_DIR}/jit/${SRC_NAME}.cpp
-  )
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/jit/${SRC_NAME}.cpp)
 endfunction(make_jit_source)

 make_jit_source(
  utils
-  kernels/bf16.h
+  kernels/jit/bf16.h
+  kernels/metal_3_0/bf16.h
+  kernels/metal_3_1/bf16.h
+  kernels/bf16_math.h
  kernels/complex.h
-  kernels/defines.h
-)
-make_jit_source(
-  unary_ops
-  kernels/erf.h
-  kernels/expm1f.h
-)
+  kernels/defines.h)
+make_jit_source(unary_ops kernels/erf.h kernels/expm1f.h)
 make_jit_source(binary_ops)
 make_jit_source(ternary_ops)
-make_jit_source(
-  reduce_utils
-  kernels/atomic.h
-  kernels/reduction/ops.h
-)
-make_jit_source(scatter)
-make_jit_source(gather)
+make_jit_source(reduce_utils kernels/atomic.h kernels/reduction/ops.h)
+make_jit_source(scatter kernels/indexing.h)
+make_jit_source(gather kernels/indexing.h)
 make_jit_source(hadamard)

-if (MLX_METAL_JIT) 
-  target_sources(
-    mlx
-    PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}/jit_kernels.cpp
-  )
+if(MLX_METAL_JIT)
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/jit_kernels.cpp)
  make_jit_source(arange)
  make_jit_source(copy)
  make_jit_source(unary)
  make_jit_source(binary)
  make_jit_source(binary_two)
-  make_jit_source(
-    fft
-    kernels/fft/radix.h
-    kernels/fft/readwrite.h
-  )
+  make_jit_source(fft kernels/fft/radix.h kernels/fft/readwrite.h)
  make_jit_source(ternary)
  make_jit_source(softmax)
  make_jit_source(scan)
  make_jit_source(sort)
  make_jit_source(
-    reduce
-    kernels/reduction/reduce_all.h
-    kernels/reduction/reduce_col.h
-    kernels/reduction/reduce_row.h
-    kernels/reduction/reduce_init.h
-  )
+    reduce kernels/reduction/reduce_all.h kernels/reduction/reduce_col.h
+    kernels/reduction/reduce_row.h kernels/reduction/reduce_init.h)
  make_jit_source(
-    steel/gemm/gemm
-    kernels/steel/utils.h
-    kernels/steel/gemm/loader.h
-    kernels/steel/gemm/mma.h
-    kernels/steel/gemm/params.h
-    kernels/steel/gemm/transforms.h
-  )
+    steel/gemm/gemm kernels/steel/utils.h kernels/steel/gemm/loader.h
+    kernels/steel/gemm/mma.h kernels/steel/gemm/params.h
+    kernels/steel/gemm/transforms.h)
  make_jit_source(steel/gemm/kernels/steel_gemm_fused)
-  make_jit_source(
-    steel/gemm/kernels/steel_gemm_masked
-    kernels/steel/defines.h
-  )
+  make_jit_source(steel/gemm/kernels/steel_gemm_masked kernels/steel/defines.h)
  make_jit_source(steel/gemm/kernels/steel_gemm_splitk)
  make_jit_source(
    steel/conv/conv
@@ -104,63 +68,52 @@ if (MLX_METAL_JIT)
    kernels/steel/conv/params.h
    kernels/steel/conv/loader.h
    kernels/steel/conv/loaders/loader_channel_l.h
-    kernels/steel/conv/loaders/loader_channel_n.h
-  )
-  make_jit_source(
-    steel/conv/kernels/steel_conv
-  )
-  make_jit_source(
-    steel/conv/kernels/steel_conv_general
-    kernels/steel/defines.h
-    kernels/steel/conv/loaders/loader_general.h
-  )
+    kernels/steel/conv/loaders/loader_channel_n.h)
+  make_jit_source(steel/conv/kernels/steel_conv)
+  make_jit_source(steel/conv/kernels/steel_conv_general kernels/steel/defines.h
+                  kernels/steel/conv/loaders/loader_general.h)
  make_jit_source(quantized)
  make_jit_source(gemv_masked)
 else()
-  target_sources(
-    mlx
-    PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}/nojit_kernels.cpp
-  )
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/nojit_kernels.cpp)
 endif()

 target_sources(
  mlx
-  PRIVATE
-  ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/custom_kernel.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/metal.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/normalization.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/rope.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
-)
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/custom_kernel.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/metal.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/normalization.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/resident.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)

-if (NOT MLX_METAL_PATH)
+if(NOT MLX_METAL_PATH)
  set(MLX_METAL_PATH ${CMAKE_CURRENT_BINARY_DIR}/kernels/)
 endif()

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels)

-target_compile_definitions(
-  mlx PRIVATE METAL_PATH="${MLX_METAL_PATH}/mlx.metallib")
+target_compile_definitions(mlx
+                           PRIVATE METAL_PATH="${MLX_METAL_PATH}/mlx.metallib")
--- a/mlx/backend/metal/allocator.cpp
+++ b/mlx/backend/metal/allocator.cpp
@@ -2,6 +2,7 @@
 #include "mlx/backend/metal/allocator.h"
 #include "mlx/backend/metal/metal.h"
 #include "mlx/backend/metal/metal_impl.h"
+#include "mlx/backend/metal/resident.h"

 #include <mach/vm_page_size.h>
 #include <unistd.h>
@@ -29,7 +30,7 @@ BufferCache::BufferCache(MTL::Device* device)
    : device_(device), head_(nullptr), tail_(nullptr), pool_size_(0) {}

 BufferCache::~BufferCache() {
-  auto thread_pool = metal::new_scoped_memory_pool();
+  auto pool = metal::new_scoped_memory_pool();
  clear();
 }

@@ -140,6 +141,7 @@ void BufferCache::remove_from_list(BufferCache::BufferHolder* to_remove) {

 MetalAllocator::MetalAllocator()
    : device_(device(mlx::core::Device::gpu).mtl_device()),
+      residency_set_(device_),
      buffer_cache_(device_) {
  auto memsize = std::get<size_t>(device_info()["memory_size"]);
  block_limit_ =
@@ -148,14 +150,18 @@ MetalAllocator::MetalAllocator()
      static_cast<size_t>(0.95 * device_->recommendedMaxWorkingSetSize()),
      block_limit_);
  max_pool_size_ = block_limit_;
+  device(mlx::core::Device::gpu)
+      .set_residency_set(residency_set_.mtl_residency_set());
 }

 size_t MetalAllocator::set_cache_limit(size_t limit) {
+  std::unique_lock lk(mutex_);
  std::swap(limit, max_pool_size_);
  return limit;
 };

 size_t MetalAllocator::set_memory_limit(size_t limit, bool relaxed) {
+  std::unique_lock lk(mutex_);
  std::swap(limit, block_limit_);
  relaxed_ = relaxed;
  gc_limit_ = std::min(
@@ -164,6 +170,13 @@ size_t MetalAllocator::set_memory_limit(size_t limit, bool relaxed) {
  return limit;
 };

+size_t MetalAllocator::set_wired_limit(size_t limit) { // Stores the new wired limit; returns the previous one.
+  std::unique_lock lk(mutex_); // held for the rest of the function
+  std::swap(limit, wired_limit_); // `limit` now holds the old value
+  residency_set_.resize(wired_limit_); // hand the new limit to the residency set
+  return limit;
+};
+
 Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
  // Metal doesn't like empty buffers
  if (size == 0) {
@@ -195,7 +208,7 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
      return Buffer{nullptr};
    }

-    auto thread_pool = metal::new_scoped_memory_pool();
+    auto pool = metal::new_scoped_memory_pool();

    // If we have a lot of memory pressure or are over the maximum cache size,
    // try to reclaim memory from the cache
@@ -205,7 +218,7 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {

    // Allocate new buffer if needed
    size_t res_opt = MTL::ResourceStorageModeShared;
-    res_opt |= MTL::ResourceHazardTrackingModeTracked;
+    res_opt |= MTL::ResourceHazardTrackingModeUntracked;
    lk.unlock();
    buf = device_->newBuffer(size, res_opt);
    lk.lock();
@@ -216,27 +229,34 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {

  // Maintain the cache below the requested limit
  if (get_cache_memory() >= max_pool_size_) {
-    auto thread_pool = metal::new_scoped_memory_pool();
+    auto pool = metal::new_scoped_memory_pool();
    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
  }

+  residency_set_.insert(buf);
+
  return Buffer{static_cast<void*>(buf)};
 }

 void MetalAllocator::clear_cache() {
  std::unique_lock lk(mutex_);
+  auto pool = metal::new_scoped_memory_pool();
  buffer_cache_.clear();
 }

 void MetalAllocator::free(Buffer buffer) {
  auto buf = static_cast<MTL::Buffer*>(buffer.ptr());
+  if (buf == nullptr) { // nothing to free; buf is dereferenced below
+    return;
+  }
  std::unique_lock lk(mutex_);
+  residency_set_.erase(buf); // done while mutex_ is held, before the buffer is recycled or released
  active_memory_ -= buf->length();
  if (get_cache_memory() < max_pool_size_) {
    buffer_cache_.recycle_to_cache(buf);
  } else {
    lk.unlock();
-    auto thread_pool = metal::new_scoped_memory_pool();
+    auto pool = metal::new_scoped_memory_pool(); // stays in scope across the release() below
    buf->release();
  }
 }
@@ -246,15 +266,9 @@ size_t MetalAllocator::size(Buffer buffer) const {
 }

 MetalAllocator& allocator() {
-  // By creating the |allocator_| on heap, the destructor of MetalAllocator will
-  // not be called on exit and all the buffers will be leaked. This is necessary
-  // because releasing buffers can take more than 30sec when the program holds a
-  // lot of RAM (for example inferencing a LLM), and it would feel frozen to
-  // users when exiting.
-  // TODO(zcbenz): Consider using the `base::NoDestructor` class from Chromium
-  // when applying this pattern to more places, or when introducing sanitizers
-  // to MLX.
-  // https://source.chromium.org/chromium/chromium/src/+/main:base/no_destructor.h
+  // By creating the |allocator_| on heap, the destructor of MetalAllocator
+  // will not be called on exit and buffers in the cache will be leaked. This
+  // can save some time at program exit.
  static MetalAllocator* allocator_ = new MetalAllocator;
  return *allocator_;
 }
@@ -265,6 +279,15 @@ size_t set_cache_limit(size_t limit) {
 size_t set_memory_limit(size_t limit, bool relaxed /* = true */) {
  return allocator().set_memory_limit(limit, relaxed);
 }
+size_t set_wired_limit(size_t limit) { // Validates `limit`, then forwards it to the allocator.
+  if (limit >
+      std::get<size_t>(device_info()["max_recommended_working_set_size"])) { // upper bound comes from device_info()
+    throw std::invalid_argument(
+        "[metal::set_wired_limit] Setting a wired limit larger than "
+        "the maximum working set size is not allowed.");
+  }
+  return allocator().set_wired_limit(limit); // returns whatever the allocator call returns
+}
 size_t get_active_memory() {
  return allocator().get_active_memory();
 }
--- a/mlx/backend/metal/allocator.h
+++ b/mlx/backend/metal/allocator.h
@@ -8,6 +8,7 @@

 #include "mlx/allocator.h"
 #include "mlx/backend/metal/device.h"
+#include "mlx/backend/metal/resident.h"

 namespace mlx::core::metal {

@@ -72,6 +73,7 @@ class MetalAllocator : public allocator::Allocator {
  };
  size_t set_cache_limit(size_t limit);
  size_t set_memory_limit(size_t limit, bool relaxed);
+  size_t set_wired_limit(size_t limit);
  void clear_cache();

 private:
@@ -82,12 +84,15 @@ class MetalAllocator : public allocator::Allocator {
  // Caching allocator
  BufferCache buffer_cache_;

+  ResidencySet residency_set_;
+
  // Allocation stats
  size_t block_limit_;
  size_t gc_limit_;
  size_t active_memory_{0};
  size_t peak_memory_{0};
  size_t max_pool_size_;
+  size_t wired_limit_{0};
  bool relaxed_{true};

  std::mutex mutex_;
--- a/mlx/backend/metal/binary.cpp
+++ b/mlx/backend/metal/binary.cpp
@@ -1,5 +1,4 @@
 // Copyright © 2024 Apple Inc.
-
 #include "mlx/backend/common/binary.h"
 #include "mlx/backend/metal/device.h"
 #include "mlx/backend/metal/kernels.h"
@@ -19,39 +18,41 @@

 namespace mlx::core {

-constexpr int MAX_BINARY_SPECIALIZED_DIMS = 5;
-
 std::string get_kernel_name(
    BinaryOpType bopt,
    const std::string& op,
    const array& a,
-    bool use_2d,
-    int ndim) {
-  std::ostringstream kname;
+    bool large, // selects the "2"-suffixed (sv2/vs2/vv2) or "large"-suffixed general names
+    int ndim,
+    int work_per_thread) { // only appears in the name for the general "n" variant
+  std::string kname;
  switch (bopt) {
    case BinaryOpType::ScalarScalar:
-      kname << "ss";
+      kname = "ss";
      break;
    case BinaryOpType::ScalarVector:
-      kname << (use_2d ? "sv2" : "sv");
+      kname = (large ? "sv2" : "sv");
      break;
    case BinaryOpType::VectorScalar:
-      kname << (use_2d ? "vs2" : "vs");
+      kname = (large ? "vs2" : "vs");
      break;
    case BinaryOpType::VectorVector:
-      kname << (use_2d ? "vv2" : "vv");
+      kname = (large ? "vv2" : "vv");
      break;
    case BinaryOpType::General:
-      kname << "g";
-      if (ndim <= MAX_BINARY_SPECIALIZED_DIMS) {
-        kname << ndim;
+      kname = "g";
+      if (ndim <= 3) { // up to 3 dims: name carries ndim itself ("g1", "g2", "g3")
+        kname += std::to_string(ndim);
      } else {
-        kname << "n";
+        concatenate(kname, "n", std::to_string(work_per_thread)); // more dims: "gn" + work_per_thread
+      }
+      if (large) {
+        kname += "large";
      }
      break;
  }
-  kname << op << type_to_name(a);
-  return kname.str();
+  concatenate(kname, "_", op, type_to_name(a)); // final form: <prefix>_<op><type name of a>
+  return kname;
 }

 void binary_op_gpu_inplace(
@@ -69,71 +70,86 @@ void binary_op_gpu_inplace(
  }

  // Try to collapse contiguous dims
-  auto [shape, strides] = collapse_contiguous_dims(a, b, out);
-  auto& strides_a = strides[0];
-  auto& strides_b = strides[1];
-  auto& strides_out = strides[2];
+  auto maybe_collapse = [bopt, &a, &b, &out]() {
+    if (bopt == BinaryOpType::General) {
+      auto [shape, strides] = collapse_contiguous_dims(a, b, out);
+      return std::make_tuple(shape, strides[0], strides[1], strides[2]);
+    } else {
+      decltype(a.strides()) e{};
+      return std::make_tuple(decltype(a.shape()){}, e, e, e);
+    }
+  };
+  auto [shape, strides_a, strides_b, strides_out] = maybe_collapse();

-  bool use_2d = out.data_size() > UINT32_MAX;
-  std::string kernel_name = get_kernel_name(bopt, op, a, use_2d, shape.size());
+  bool large = out.data_size() > UINT32_MAX;
+  auto ndim = shape.size();
+  int work_per_thread;
+  if (bopt == BinaryOpType::General) {
+    large |= (a.data_size() > UINT32_MAX || b.data_size() > UINT32_MAX);
+    work_per_thread = large ? 4 : 2;
+  } else {
+    work_per_thread = 1;
+  }
+  std::string kernel_name =
+      get_kernel_name(bopt, op, a, large, shape.size(), work_per_thread);
  auto& d = metal::device(s.device);

-  auto kernel =
-      get_binary_two_kernel(d, kernel_name, a.dtype(), outputs[0].dtype(), op);
-
+  auto kernel = outputs.size() == 2
+      ? get_binary_two_kernel(d, kernel_name, a.dtype(), out.dtype(), op)
+      : get_binary_kernel(d, kernel_name, a.dtype(), out.dtype(), op);
  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  // - If a is donated it goes to the first output
  // - If b is donated it goes to the first output if a was not donated
-  //   otherwise it goes to the second output
+  //   otherwise it goes to the second output.
+  // - If there is only one output only one of a and b will be donated.
  bool donate_a = a.data_shared_ptr() == nullptr;
  bool donate_b = b.data_shared_ptr() == nullptr;
-  compute_encoder.set_input_array(donate_a ? outputs[0] : a, 0);
+  int arg_idx = 0;
+  compute_encoder.set_input_array(donate_a ? outputs[0] : a, arg_idx++);
  compute_encoder.set_input_array(
-      donate_b ? (donate_a ? outputs[1] : outputs[0]) : b, 1);
-  compute_encoder.set_output_array(outputs[0], 2);
-  compute_encoder.set_output_array(outputs[1], 3);
+      donate_b ? (donate_a ? outputs[1] : outputs[0]) : b, arg_idx++);
+  compute_encoder.set_output_array(outputs[0], arg_idx++);
+  if (outputs.size() == 2) {
+    compute_encoder.set_output_array(outputs[1], arg_idx++);
+  }

+  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (bopt == BinaryOpType::General) {
-    auto ndim = shape.size();
-    if (ndim > 3) {
-      compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 4);
-      compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 5);
-      compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 6);
-    } else {
-      // The shape is implicit in the grid for <= 3D
-      compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 4);
-      compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 5);
-    }
-
-    if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
-      compute_encoder->setBytes(&ndim, sizeof(int), 7);
-    }
-
    // Launch up to 3D grid of threads
    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
    size_t rest = out.size() / (dim0 * dim1);
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
+
+    if (ndim > 3) {
+      compute_encoder.set_vector_bytes(shape, arg_idx++);
+      compute_encoder.set_vector_bytes(strides_a, arg_idx++);
+      compute_encoder.set_vector_bytes(strides_b, arg_idx++);
+      compute_encoder.set_bytes<int>(ndim, arg_idx++);
+      dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
+    } else {
+      // The shape is implicit in the grid for <= 3D
+      compute_encoder.set_vector_bytes(strides_a, arg_idx++);
+      compute_encoder.set_vector_bytes(strides_b, arg_idx++);
+    }
+
    if (thread_group_size != 1024) {
      throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
    }
    auto group_dims = get_block_dims(dim0, dim1, rest);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+    compute_encoder.dispatch_threads(grid_dims, group_dims);
  } else {
    // Launch a 1D or 2D grid of threads
    size_t nthreads = out.data_size();
-    MTL::Size grid_dims = use_2d
-        ? get_2d_grid_dims(outputs[0].shape(), outputs[0].strides())
-        : MTL::Size(nthreads, 1, 1);
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
    if (thread_group_size > nthreads) {
      thread_group_size = nthreads;
    }
    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+    MTL::Size grid_dims = large ? get_2d_grid_dims(out.shape(), out.strides())
+                                : MTL::Size(nthreads, 1, 1);
+    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
 }

@@ -164,72 +180,8 @@ void binary_op_gpu_inplace(
    array& out,
    const std::string& op,
    const Stream& s) {
-  auto& a = inputs[0];
-  auto& b = inputs[1];
-  auto bopt = get_binary_op_type(a, b);
-  if (out.size() == 0) {
-    return;
-  }
-
-  // Try to collapse contiguous dims
-  auto [shape, strides] = collapse_contiguous_dims(a, b, out);
-  auto& strides_a = strides[0];
-  auto& strides_b = strides[1];
-  auto& strides_out = strides[2];
-
-  bool use_2d = out.data_size() > UINT32_MAX;
-  std::string kernel_name = get_kernel_name(bopt, op, a, use_2d, shape.size());
-  auto& d = metal::device(s.device);
-
-  auto kernel = get_binary_kernel(d, kernel_name, a.dtype(), out.dtype(), op);
-  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
-  bool donate_a = a.data_shared_ptr() == nullptr;
-  bool donate_b = b.data_shared_ptr() == nullptr;
-  compute_encoder.set_input_array(donate_a ? out : a, 0);
-  compute_encoder.set_input_array(donate_b ? out : b, 1);
-  compute_encoder.set_output_array(out, 2);
-
-  if (bopt == BinaryOpType::General) {
-    auto ndim = shape.size();
-    if (ndim > 3) {
-      compute_encoder->setBytes(shape.data(), ndim * sizeof(int), 3);
-      compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 4);
-      compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 5);
-    } else {
-      // The shape is implicit in the grid for <= 3D
-      compute_encoder->setBytes(strides_a.data(), ndim * sizeof(size_t), 3);
-      compute_encoder->setBytes(strides_b.data(), ndim * sizeof(size_t), 4);
-    }
-
-    if (ndim > MAX_BINARY_SPECIALIZED_DIMS) {
-      compute_encoder->setBytes(&ndim, sizeof(int), 6);
-    }
-
-    // Launch up to 3D grid of threads
-    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
-    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
-    size_t rest = out.size() / (dim0 * dim1);
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
-    if (thread_group_size != 1024) {
-      throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
-    }
-    auto group_dims = get_block_dims(dim0, dim1, rest);
-    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
-  } else {
-    // Launch a 1D or 2D grid of threads
-
-    size_t nthreads = out.data_size();
-    MTL::Size grid_dims = use_2d ? get_2d_grid_dims(out.shape(), out.strides())
-                                 : MTL::Size(nthreads, 1, 1);
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
-    if (thread_group_size > nthreads) {
-      thread_group_size = nthreads;
-    }
-    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
-  }
+  std::vector<array> outputs = {out};
+  binary_op_gpu_inplace(inputs, outputs, op, s);
 }

 void binary_op_gpu(
--- a/mlx/backend/metal/compiled.cpp
+++ b/mlx/backend/metal/compiled.cpp
@@ -1,5 +1,6 @@
 // Copyright © 2023-2024 Apple Inc.
-
+#include <fmt/format.h>
+#include <iostream> //TODO
 #include <sstream>

 #include "mlx/backend/common/compiled.h"
@@ -11,10 +12,12 @@
 #include "mlx/primitives.h"
 #include "mlx/utils.h"

+using namespace fmt::literals;
+
 namespace mlx::core {

 inline void build_kernel(
-    std::ostream& os,
+    std::string& os,
    const std::string& kernel_name,
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
@@ -22,7 +25,9 @@ inline void build_kernel(
    const std::unordered_set<uintptr_t>& constant_ids,
    bool contiguous,
    int ndim,
-    bool dynamic_dims) {
+    bool dynamic_dims,
+    bool use_big_index = false,
+    int work_per_thread = 1) {
  // All outputs should have the exact same shape and will be row contiguous
  auto output_shape = outputs[0].shape();
  auto output_strides = outputs[0].strides();
@@ -37,8 +42,8 @@ inline void build_kernel(
  int cnt = 0;

  // Start the kernel
-  os << "[[host_name(\"" << kernel_name << "\")]]" << std::endl
-     << "[[kernel]] void " << kernel_name << "(" << std::endl;
+  os += fmt::format(
+      "[[host_name(\"{0}\")]]\n[[kernel]] void {0}(\n", kernel_name);

  // Add the input arguments
  for (auto& x : inputs) {
@@ -50,129 +55,201 @@ inline void build_kernel(
    }

    // Scalars and contiguous need no strides
-    if (is_scalar(x) || contiguous) {
-      os << "    device const " << get_type_string(x.dtype()) << "* " << xname
-         << " [[buffer(" << cnt++ << ")]]," << std::endl;
-    } else {
+    if (!is_scalar(x) && !contiguous) {
      add_indices = true;
-      os << "    device const " << get_type_string(x.dtype()) << "* " << xname
-         << " [[buffer(" << cnt++ << ")]]," << std::endl;
    }
+    os += fmt::format(
+        "    device const {0}* {1} [[buffer({2})]],\n",
+        get_type_string(x.dtype()),
+        xname,
+        cnt++);
  }

  if (add_indices) {
-    os << "    constant const size_t* in_strides [[buffer(" << cnt++
-       << ")]],\n";
+    os += fmt::format(
+        "    constant const int64_t* in_strides [[buffer({0})]],\n", cnt++);
  }

  // Add the output arguments
  for (auto& x : outputs) {
-    os << "    device " << get_type_string(x.dtype()) << "* "
-       << namer.get_name(x) << " [[buffer(" << cnt++ << ")]]," << std::endl;
+    os += fmt::format(
+        "    device {0}* {1} [[buffer({2})]],\n",
+        get_type_string(x.dtype()),
+        namer.get_name(x),
+        cnt++);
  }
  // Add output strides and shape to extract the indices.
  if (!contiguous) {
-    os << "    constant const size_t* output_strides [[buffer(" << cnt++
-       << ")]]," << std::endl
-       << "    constant const int* output_shape [[buffer(" << cnt++ << ")]],"
-       << std::endl;
+    os += fmt::format(
+        "    constant const int64_t* output_strides [[buffer({0})]],\n", cnt++);
+    os += fmt::format(
+        "    constant const int* output_shape [[buffer({0})]],\n", cnt++);
  }
  if (dynamic_dims) {
-    os << "    constant const int& ndim [[buffer(" << cnt++ << ")]],"
-       << std::endl;
+    os += fmt::format("    constant const int& ndim [[buffer({0})]],\n", cnt++);
  }

  // The thread index in the whole grid
-  os << "    uint3 pos [[thread_position_in_grid]]," << std::endl
-     << "    uint3 grid [[threads_per_grid]]) {" << std::endl
-     << "  uint index = pos.x + grid.x * (pos.y + grid.y * pos.z);"
-     << std::endl;
+  os += "    uint3 pos [[thread_position_in_grid]],\n";
+  os += "    uint3 grid [[threads_per_grid]]) {\n";

-  // Extract the indices per axis to individual uints if we have arrays that
-  // are broadcasted or transposed
-  if (add_indices) {
-    if (!dynamic_dims) {
-      if (ndim == 1) {
-        os << "  uint index_0 = pos.x;" << std::endl;
-      } else if (ndim == 2) {
-        os << "  uint index_0 = pos.y;" << std::endl
-           << "  uint index_1 = pos.x;" << std::endl;
-      } else if (ndim == 3) {
-        os << "  uint index_0 = pos.z;" << std::endl
-           << "  uint index_1 = pos.y;" << std::endl
-           << "  uint index_2 = pos.x;" << std::endl;
-      } else {
-        for (int i = 0; i < ndim - 2; i++) {
-          os << "  uint index_" << i << " = (index / uint(output_strides[" << i
-             << "])) % output_shape[" << i << "];" << std::endl;
-        }
-        os << "  uint index_" << ndim - 2 << " = pos.y;" << std::endl
-           << "  uint index_" << ndim - 1 << " = pos.x;" << std::endl;
-      }
-    }
+  std::string idx_type = use_big_index ? "int64_t" : "uint";
+  if (contiguous && use_big_index) {
+    // This is only used for contiguous kernels which don't have
+    // a third grid dimension
+    os += "  int64_t index = pos.x + grid.x * int64_t(pos.y);\n";
+  } else if (work_per_thread > 1) {
+    os += fmt::format("  constexpr int N_ = {0};\n", work_per_thread);
+    os += fmt::format(
+        "  int xshape = output_shape[{0}];\n",
+        dynamic_dims ? "ndim - 1" : std::to_string(ndim - 1));
+    os += fmt::format(
+        "  {0} index = N_ * pos.x + xshape * (pos.y + {0}(grid.y) * pos.z);\n",
+        idx_type);
+  } else {
+    os += fmt::format(
+        "  {0} index = pos.x + grid.x * (pos.y + {0}(grid.y) * pos.z);\n",
+        idx_type);
  }

-  // Read the inputs in tmps
-  int nc_in_count = 0;
+  // Read constant, scalar and contiguous inputs into tmps; defer the rest
+  std::vector<array> nc_inputs;
  for (int i = 0; i < inputs.size(); ++i) {
    auto& x = inputs[i];
    auto& xname = namer.get_name(x);

    if (is_constant(x)) {
      auto type_str = get_type_string(x.dtype());
-      os << "  auto tmp_" << xname << " = static_cast<"
-         << get_type_string(x.dtype()) << ">(";
-      print_constant(os, x);
-      os << ");" << std::endl;
+      std::ostringstream ss;
+      print_constant(ss, x);
+      os += fmt::format(
+          "  auto tmp_{0} = static_cast<{1}>({2});\n",
+          xname,
+          get_type_string(x.dtype()),
+          ss.str());
    } else if (is_scalar(x)) {
-      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
-         << xname << "[0];" << std::endl;
+      os += fmt::format(
+          "  {0} tmp_{1} = {1}[0];\n", get_type_string(x.dtype()), xname);
    } else if (contiguous) {
-      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
-         << xname << "[index];" << std::endl;
-    } else if (!dynamic_dims) {
-      int offset = nc_in_count * ndim;
-      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
-         << xname << "[";
-      os << "index_0 * " << "in_strides[" << offset << "]";
-      for (int i = 1; i < ndim; i++) {
-        os << " + index_" << i << " * " << "in_strides[" << offset + i << "]";
-      }
-      os << "];" << std::endl;
-      nc_in_count++;
+      os += fmt::format(
+          "  {0} tmp_{1} = {1}[index];\n", get_type_string(x.dtype()), xname);
    } else {
-      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
-         << xname << "[elem_to_loc(index, output_shape, in_strides + "
-         << nc_in_count * ndim << ", ndim)];" << std::endl;
-      nc_in_count++;
+      nc_inputs.push_back(x);
    }
  }

+  // Initialize the indices for non-contiguous inputs
+  for (int i = 0; i < nc_inputs.size(); ++i) {
+    auto& xname = namer.get_name(nc_inputs[i]);
+    os += fmt::format("  {0} index_{1} = ", idx_type, xname);
+    if (ndim == 1) {
+      int offset = i * ndim;
+      os +=
+          fmt::format("elem_to_loc_1<uint>(pos.x, in_strides[{0}]);\n", offset);
+    } else if (ndim == 2) {
+      int offset = i * ndim;
+      os += fmt::format(
+          "elem_to_loc_2<{0}>({{pos.x, pos.y}}, in_strides + {1});\n",
+          idx_type,
+          offset);
+    } else if (ndim == 3) {
+      int offset = i * ndim;
+      os += fmt::format(
+          "elem_to_loc_3<{0}>(pos, in_strides + {1});\n", idx_type, offset);
+    } else if (!dynamic_dims) {
+      int offset = (i + 1) * ndim;
+      os += fmt::format(
+          "N_ * pos.x * {0}(in_strides[{1}]) + pos.y * {0}(in_strides[{2}]);\n",
+          idx_type,
+          offset - 1,
+          offset - 2);
+    } else {
+      os += fmt::format(
+          "N_ * pos.x * {0}(in_strides[ndim * {1} + ndim - 1]) + pos.y * {0}(in_strides[ndim * {1} + ndim - 2]);\n",
+          idx_type,
+          i);
+    }
+  }
+
+  if (!nc_inputs.empty() && (ndim > 3 || dynamic_dims)) {
+    os += "  uint zpos = pos.z;\n";
+    if (dynamic_dims) {
+      os += "  for (int d = ndim - 3; d >= 0; --d) {\n";
+    } else {
+      os += fmt::format("  for (int d = {0}; d >= 0; --d) {{\n", ndim - 3);
+    }
+    os += "    uint l = zpos % output_shape[d];\n";
+    for (int i = 0; i < nc_inputs.size(); ++i) {
+      auto& xname = namer.get_name(nc_inputs[i]);
+      os += fmt::format("    index_{0} += ", xname);
+      if (dynamic_dims) {
+        os +=
+            fmt::format("l * {0}(in_strides[{1} * ndim + d]);\n", idx_type, i);
+      } else {
+        os +=
+            fmt::format("l * {0}(in_strides[{1} + d]);\n", idx_type, i * ndim);
+      }
+    }
+    os += "    zpos /= output_shape[d];\n  }\n";
+  }
+
+  // Open per-thread loop
+  if (work_per_thread > 1) {
+    os +=
+        "  for (int i = 0; i < N_ && (int(N_ * pos.x) + i) < xshape; ++i) {\n";
+  }
+
+  // Read non-contiguous inputs into tmps
+  for (int i = 0; i < nc_inputs.size(); ++i) {
+    auto& x = nc_inputs[i];
+    auto& xname = namer.get_name(x);
+    os += fmt::format(
+        "  {0} tmp_{1} = {1}[index_{1}];\n", get_type_string(x.dtype()), xname);
+  }
+
  // Actually write the computation
  for (auto& x : tape) {
-    os << "  " << get_type_string(x.dtype()) << " tmp_" << namer.get_name(x)
-       << " = ";
+    os += fmt::format(
+        "  {0} tmp_{1} = ", get_type_string(x.dtype()), namer.get_name(x));
    if (is_static_cast(x.primitive())) {
-      os << "static_cast<" << get_type_string(x.dtype()) << ">(tmp_"
-         << namer.get_name(x.inputs()[0]) << ");" << std::endl;
+      os += fmt::format(
+          "static_cast<{0}>(tmp_{1});\n",
+          get_type_string(x.dtype()),
+          namer.get_name(x.inputs()[0]));
    } else {
-      x.primitive().print(os);
-      os << "()(";
+      std::ostringstream ss;
+      x.primitive().print(ss);
+      os += ss.str();
+      os += "()(";
      for (int i = 0; i < x.inputs().size() - 1; i++) {
-        os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
+        os += fmt::format("tmp_{0}, ", namer.get_name(x.inputs()[i]));
      }
-      os << "tmp_" << namer.get_name(x.inputs().back()) << ");" << std::endl;
+      os += fmt::format("tmp_{0});\n", namer.get_name(x.inputs().back()));
    }
  }

  // Write the outputs from tmps
  for (auto& x : outputs) {
-    os << "  " << namer.get_name(x) << "[index] = tmp_" << namer.get_name(x)
-       << ";" << std::endl;
+    os += fmt::format("  {0}[index] = tmp_{0};\n", namer.get_name(x));
+  }
+  // Advance indices one step along the last axis, close the per-thread loop
+  if (work_per_thread > 1) {
+    for (int i = 0; i < nc_inputs.size(); ++i) {
+      auto& x = nc_inputs[i];
+      auto& xname = namer.get_name(x);
+      if (!dynamic_dims) {
+        os += fmt::format(
+            "  index_{0} += in_strides[{1}];\n", xname, i * ndim + ndim - 1);
+      } else {
+        os += fmt::format(
+            "  index_{0} += in_strides[{1} * ndim + ndim - 1];\n", xname, i);
+      }
+    }
+    os += "  index++;\n  }\n";
  }

  // Finish the kernel
-  os << "}" << std::endl;
+  os += "}\n";

  if (cnt > 31) {
    std::ostringstream msg;
@@ -195,13 +272,10 @@ void Compiled::eval_gpu(
  // Get the kernel if someone else built it already
  auto& s = stream();
  auto& d = metal::device(s.device);
-  auto lib = d.get_library(kernel_lib_);
-
-  // If not we have to build it ourselves
-  if (lib == nullptr) {
-    std::ostringstream kernel;
-    kernel << metal::utils() << metal::unary_ops() << metal::binary_ops()
-           << metal::ternary_ops();
+  auto lib = d.get_library(kernel_lib_, [&]() {
+    std::string kernel = metal::utils();
+    concatenate(
+        kernel, metal::unary_ops(), metal::binary_ops(), metal::ternary_ops());
    build_kernel(
        kernel,
        kernel_lib_ + "_contiguous",
@@ -212,6 +286,17 @@ void Compiled::eval_gpu(
        /* contiguous = */ true,
        /* ndim = */ 0,
        /* dynamic_dims = */ false);
+    build_kernel(
+        kernel,
+        kernel_lib_ + "_contiguous_large",
+        inputs_,
+        outputs_,
+        tape_,
+        constant_ids_,
+        /* contiguous = */ true,
+        /* ndim = */ 0,
+        /* dynamic_dims = */ false,
+        /* use_big_index = */ true);
    for (int i = 1; i < 8; i++) {
      build_kernel(
          kernel,
@@ -222,7 +307,23 @@ void Compiled::eval_gpu(
          constant_ids_,
          /* contiguous = */ false,
          /* ndim = */ i,
-          /* dynamic_dims = */ false);
+          /* dynamic_dims = */ false,
+          /* use_big_index = */ false,
+          /* work_per_thread = */ i > 3 ? 2 : 1);
+      if (i > 1) {
+        build_kernel(
+            kernel,
+            kernel_lib_ + "_strided_" + std::to_string(i) + "_large",
+            inputs_,
+            outputs_,
+            tape_,
+            constant_ids_,
+            /* contiguous = */ false,
+            /* ndim = */ i,
+            /* dynamic_dims = */ false,
+            /* use_big_index = */ true,
+            /* work_per_thread = */ i > 3 ? 4 : 1);
+      }
    }
    build_kernel(
        kernel,
@@ -233,21 +334,34 @@ void Compiled::eval_gpu(
        constant_ids_,
        /* contiguous = */ false,
        /* ndim = */ 0,
-        /* dynamic_dims = */ true);
-
-    lib = d.get_library(kernel_lib_, kernel.str());
-  }
+        /* dynamic_dims = */ true,
+        /* use_big_index = */ false,
+        /* work_per_thread = */ 2);
+    build_kernel(
+        kernel,
+        kernel_lib_ + "_strided_dynamic_large",
+        inputs_,
+        outputs_,
+        tape_,
+        constant_ids_,
+        /* contiguous = */ false,
+        /* ndim = */ 0,
+        /* dynamic_dims = */ true,
+        /* use_big_index = */ true,
+        /* work_per_thread = */ 4);
+    return kernel;
+  });

  // Figure out which kernel we are using
  auto& output_shape = outputs[0].shape();
-  bool contiguous = compiled_check_contiguity(inputs, output_shape);
+  auto contiguous = compiled_check_contiguity(inputs, output_shape);

  // Collapse contiguous dims to route to a faster kernel if possible. Also
  // handle all broadcasting.
-  std::vector<std::vector<size_t>> initial_strides;
+  std::vector<Strides> initial_strides;
  initial_strides.push_back(outputs[0].strides());
-  std::vector<int> shape;
-  std::vector<std::vector<size_t>> strides;
+  Shape shape;
+  std::vector<Strides> strides;
  if (!contiguous) {
    for (int i = 0; i < inputs.size(); i++) {
      // Skip constants.
@@ -262,7 +376,7 @@ void Compiled::eval_gpu(
      }

      // Broadcast the inputs to the output shape.
-      std::vector<size_t> xstrides;
+      Strides xstrides;
      int j = 0;
      for (; j < output_shape.size() - x.ndim(); j++) {
        if (output_shape[j] == 1) {
@@ -285,7 +399,22 @@ void Compiled::eval_gpu(
      initial_strides.push_back(std::move(xstrides));
    }
    std::tie(shape, strides) =
-        collapse_contiguous_dims(output_shape, initial_strides);
+        collapse_contiguous_dims(output_shape, initial_strides, INT32_MAX);
+  }
+
+  bool large;
+  if (contiguous) {
+    size_t max_size = 0;
+    for (auto& in : inputs) {
+      max_size = std::max(max_size, in.data_size());
+    }
+    large = (max_size > UINT32_MAX);
+  } else {
+    size_t max_size = 0;
+    for (auto& o : outputs) {
+      max_size = std::max(max_size, o.size());
+    }
+    large = (max_size > UINT32_MAX);
  }

  // Get the kernel from the lib
@@ -299,14 +428,17 @@ void Compiled::eval_gpu(
      kernel_name += std::to_string(shape.size());
    }
  }
+  if (large) {
+    kernel_name += "_large";
+  }
  auto kernel = d.get_kernel(kernel_name, lib);
  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  // Put the inputs in
  int cnt = 0;
  int stride_idx = 1; // idx 0 is the output strides
-  std::vector<size_t> in_strides;
+  Strides in_strides;
  for (int i = 0; i < inputs.size(); i++) {
    if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
      continue;
@@ -322,8 +454,7 @@ void Compiled::eval_gpu(
    }
  }
  if (!in_strides.empty()) {
-    compute_encoder->setBytes(
-        in_strides.data(), in_strides.size() * sizeof(size_t), cnt++);
+    compute_encoder.set_vector_bytes(in_strides, cnt++);
  }

  compiled_allocate_outputs(
@@ -336,34 +467,43 @@ void Compiled::eval_gpu(

  // Put the output shape and strides in
  if (!contiguous) {
-    compute_encoder->setBytes(
-        strides[0].data(), strides[0].size() * sizeof(size_t), cnt++);
-    compute_encoder->setBytes(shape.data(), shape.size() * sizeof(int), cnt++);
+    compute_encoder.set_vector_bytes(strides[0], cnt++);
+    compute_encoder.set_vector_bytes(shape, cnt++);
  }

  // Put the number of dims in if it is dynamic
  if (dynamic) {
-    compute_encoder->setBytes(&ndim, sizeof(int), cnt++);
+    compute_encoder.set_bytes(ndim, cnt++);
  }

  // Launch the kernel
  if (contiguous) {
-    size_t nthreads = outputs[0].size();
-    MTL::Size grid_dims(nthreads, 1, 1);
+    size_t nthreads = outputs[0].data_size();
    MTL::Size group_dims(
        std::min(nthreads, kernel->maxTotalThreadsPerThreadgroup()), 1, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+
+    MTL::Size grid_dims = large
+        ? get_2d_grid_dims(outputs[0].shape(), outputs[0].strides())
+        : MTL::Size(nthreads, 1, 1);
+    compute_encoder.dispatch_threads(grid_dims, group_dims);
  } else {
    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
    size_t rest = outputs[0].size() / (dim0 * dim1);
+    int work_per_thread = ndim > 3 ? (large ? 4 : 2) : 1;
+    dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
-    if (thread_group_size != 1024) {
-      throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
+    int pow2;
+    if (thread_group_size == 1024) {
+      pow2 = 10;
+    } else if (thread_group_size > 512) {
+      pow2 = 9;
+    } else {
+      throw std::runtime_error("[Metal::compiled] Must use > 512 sized block");
    }
-    auto group_dims = get_block_dims(dim0, dim1, rest);
+    auto group_dims = get_block_dims(dim0, dim1, rest, pow2);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
 }

--- a/mlx/backend/metal/conv.cpp
+++ b/mlx/backend/metal/conv.cpp
@@ -44,27 +44,28 @@ void explicit_gemm_conv_ND_gpu(
  kname << "naive_unfold_nd_" << type_to_name(in_unfolded) << "_" << N;
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kname.str());
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(in_unfolded, 1);

-  compute_encoder->setBytes(&conv_params, sizeof(conv_params), 2);
+  compute_encoder.set_bytes(conv_params, 2);

  // Launch unfolding kernel
-  int tgp_x = std::min(conv_params.C, 64);
+  size_t tgp_x = std::min(conv_params.C, 64);
  tgp_x = 32 * ((tgp_x + 32 - 1) / 32);
-  int tgp_y = 256 / tgp_x;
+  size_t tgp_y = 256 / tgp_x;

-  MTL::Size group_dims = MTL::Size(tgp_x, tgp_y, 1);
  MTL::Size grid_dims = MTL::Size(
      conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]);
+  MTL::Size group_dims = MTL::Size(
+      std::min(tgp_x, grid_dims.width), std::min(tgp_y, grid_dims.height), 1);

-  compute_encoder.dispatchThreads(grid_dims, group_dims);
+  compute_encoder.dispatch_threads(grid_dims, group_dims);

  // Reshape weight
-  std::vector<int> wt_reshape{implicit_K, implicit_N};
-  std::vector<size_t> wt_restride{1, static_cast<size_t>(implicit_K)};
+  Shape wt_reshape{implicit_K, implicit_N};
+  Strides wt_restride{1, implicit_K};
  array wt_reshaped(wt_reshape, wt.dtype(), nullptr, {});
  auto wt_flags = wt.flags();
  wt_flags.row_contiguous = false;
@@ -72,7 +73,7 @@ void explicit_gemm_conv_ND_gpu(
  wt_reshaped.copy_shared_buffer(wt, wt_restride, wt_flags, wt.data_size());

  // Perform gemm
-  std::vector<array> copies = {in_unfolded, wt_reshaped};
+  std::vector<array> copies = {in_unfolded};
  return steel_matmul(
      s,
      d,
@@ -122,55 +123,58 @@ void explicit_gemm_conv_group_ND_gpu(
        << N;
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kname.str());
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(in_unfolded, 1);

-  compute_encoder->setBytes(&conv_params, sizeof(conv_params), 2);
+  compute_encoder.set_bytes(conv_params, 2);

  // Launch unfolding kernel
-  int tgp_x = std::min(conv_params.C, 64);
+  size_t tgp_x = std::min(conv_params.C, 64);
  tgp_x = 32 * ((tgp_x + 32 - 1) / 32);
-  int tgp_y = 256 / tgp_x;
+  size_t tgp_y = 256 / tgp_x;

-  MTL::Size group_dims = MTL::Size(tgp_x, tgp_y, 1);
  MTL::Size grid_dims = MTL::Size(
      conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]);
+  MTL::Size group_dims = MTL::Size(
+      std::min(tgp_x, grid_dims.width), std::min(tgp_y, grid_dims.height), 1);

-  compute_encoder.dispatchThreads(grid_dims, group_dims);
+  compute_encoder.dispatch_threads(grid_dims, group_dims);

  // Transpose kernel weights so that we can slice them by contiguous chunks
  // of channel groups.
  array wt_view(
      {wt.shape(0), C_per_group, kernel_size}, wt.dtype(), nullptr, {});
  wt_view.copy_shared_buffer(
-      wt,
-      {wt.strides(0), 1, static_cast<size_t>(C_per_group)},
-      wt.flags(),
-      wt.size());
+      wt, {wt.strides(0), 1, C_per_group}, wt.flags(), wt.size());

  // Materialize
  auto wt_transpose = array(wt_view.shape(), wt_view.dtype(), nullptr, {});
  copy_gpu(wt_view, wt_transpose, CopyType::General, s);

  // Perform gemm
-  std::vector<array> copies = {in_unfolded, wt_view, wt_transpose};
-  return steel_matmul_conv_groups(
+  std::vector<array> copies = {in_unfolded, wt_transpose};
+  return steel_matmul_regular(
      s,
      d,
-      /*a = */ in_unfolded,
-      /*b = */ wt_transpose,
-      /*c = */ out,
-      /*M = */ implicit_M,
-      /*N = */ implicit_N,
-      /*K = */ implicit_K,
-      /*a_cols = */ implicit_K * groups,
-      /*b_cols = */ implicit_K,
-      /*out_cols = */ implicit_N * groups,
-      /*a_transposed = */ false,
-      /*b_transposed = */ true,
-      /* groups = */ groups,
+      /* a = */ in_unfolded,
+      /* b = */ wt_transpose,
+      /* c = */ out,
+      /* M = */ implicit_M,
+      /* N = */ implicit_N,
+      /* K = */ implicit_K,
+      /* batch_size_out = */ groups,
+      /* a_cols = */ implicit_K * groups,
+      /* b_cols = */ implicit_K,
+      /* out_cols = */ implicit_N * groups,
+      /* a_transposed = */ false,
+      /* b_transposed = */ true,
+      /* batch_shape = */ {1},
+      /* batch_strides = */ {0},
+      /* A_batch_strides = */ size_t(implicit_K),
+      /* B_batch_strides = */ size_t(implicit_N) * implicit_K,
+      /* matrix_stride_out = */ size_t(implicit_N),
      /*copies = */ copies);
 }

@@ -232,7 +236,7 @@ void slow_conv_2D_gpu(
  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kname.str());
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  size_t n_pixels = conv_params.oS[0] * conv_params.oS[1];

@@ -247,8 +251,8 @@ void slow_conv_2D_gpu(
  compute_encoder.set_input_array(wt, 1);
  compute_encoder.set_output_array(out, 2);

-  compute_encoder->setBytes(&conv_params, sizeof(MLXConvParams<2>), 3);
-  compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+  compute_encoder.set_bytes(conv_params, 3);
+  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

 void implicit_gemm_conv_2D_gpu(
@@ -347,7 +351,7 @@ void implicit_gemm_conv_2D_gpu(
      wn,
      n_channel_specialization,
      small_filter);
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  // Deduce grid launch dimensions
  int tile = 1 << swizzle_log;
@@ -363,11 +367,11 @@ void implicit_gemm_conv_2D_gpu(
  compute_encoder.set_output_array(out, 2);

  // Encode params
-  compute_encoder->setBytes(&conv_params, sizeof(MLXConvParams<2>), 3);
-  compute_encoder->setBytes(&gemm_params, sizeof(ImplicitGemmConv2DParams), 4);
+  compute_encoder.set_bytes(conv_params, 3);
+  compute_encoder.set_bytes(gemm_params, 4);

  // Launch kernel
-  compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

 void implicit_gemm_conv_2D_general_gpu(
@@ -501,7 +505,7 @@ void implicit_gemm_conv_2D_general_gpu(
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel =
      get_steel_conv_general_kernel(d, kname.str(), out, bm, bn, bk, wm, wn);
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  // Deduce grid launch dimensions
  int tile = 1 << swizzle_log;
@@ -518,17 +522,15 @@ void implicit_gemm_conv_2D_general_gpu(
  compute_encoder.set_output_array(out, 2);

  // Encode params
-  compute_encoder->setBytes(&conv_params, sizeof(MLXConvParams<2>), 3);
-  compute_encoder->setBytes(&gemm_params, sizeof(ImplicitGemmConv2DParams), 4);
-  compute_encoder->setBytes(&jump_params, sizeof(Conv2DGeneralJumpParams), 5);
+  compute_encoder.set_bytes(conv_params, 3);
+  compute_encoder.set_bytes(gemm_params, 4);
+  compute_encoder.set_bytes(jump_params, 5);

-  compute_encoder->setBytes(
-      base_h.data(), sizeof(Conv2DGeneralBaseInfo) * base_h.size(), 6);
-  compute_encoder->setBytes(
-      base_w.data(), sizeof(Conv2DGeneralBaseInfo) * base_w.size(), 7);
+  compute_encoder.set_vector_bytes(base_h, 6);
+  compute_encoder.set_vector_bytes(base_w, 7);

  // Launch kernel
-  compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
 }

 void winograd_conv_2D_gpu(
@@ -617,18 +619,18 @@ void winograd_conv_2D_gpu(
          << bc;
    auto& compute_encoder = d.get_command_encoder(s.index);
    auto kernel = d.get_kernel(kname.str());
-    compute_encoder->setComputePipelineState(kernel);
+    compute_encoder.set_compute_pipeline_state(kernel);

    compute_encoder.set_input_array(wt, 0);
    compute_encoder.set_output_array(filt_wg, 1);

-    compute_encoder->setBytes(&C_c, sizeof(int), 2);
-    compute_encoder->setBytes(&O_c, sizeof(int), 3);
+    compute_encoder.set_bytes(C_c, 2);
+    compute_encoder.set_bytes(O_c, 3);

    MTL::Size group_dims = MTL::Size(32, bo, 1);
    MTL::Size grid_dims = MTL::Size(O_c / bo, 1, 1);

-    compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
  }

  // Do input transform
@@ -645,18 +647,17 @@ void winograd_conv_2D_gpu(
          << bc;
    auto& compute_encoder = d.get_command_encoder(s.index);
    auto kernel = d.get_kernel(kname.str());
-    compute_encoder->setComputePipelineState(kernel);
+    compute_encoder.set_compute_pipeline_state(kernel);

    compute_encoder.set_input_array(in_padded, 0);
    compute_encoder.set_output_array(inp_wg, 1);

-    compute_encoder->setBytes(
-        &conv_params_updated, sizeof(MLXConvParams<2>), 2);
+    compute_encoder.set_bytes(conv_params_updated, 2);

    MTL::Size group_dims = MTL::Size(32, wn, wm);
    MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);

-    compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
  }

  // Do batched gemm
@@ -693,18 +694,17 @@ void winograd_conv_2D_gpu(
          << bc;
    auto& compute_encoder = d.get_command_encoder(s.index);
    auto kernel = d.get_kernel(kname.str());
-    compute_encoder->setComputePipelineState(kernel);
+    compute_encoder.set_compute_pipeline_state(kernel);

    compute_encoder.set_input_array(out_wg, 0);
    compute_encoder.set_output_array(out, 1);

-    compute_encoder->setBytes(
-        &conv_params_updated, sizeof(MLXConvParams<2>), 2);
+    compute_encoder.set_bytes(conv_params_updated, 2);

    MTL::Size group_dims = MTL::Size(32, wn, wm);
    MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);

-    compute_encoder.dispatchThreadgroups(grid_dims, group_dims);
+    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
  }
 }

@@ -747,10 +747,6 @@ void conv_2D_gpu(
  bool is_kdil_one = conv_params.kdil[0] == 1 && conv_params.kdil[1] == 1;
  bool is_idil_one = conv_params.idil[0] == 1 && conv_params.idil[1] == 1;

-  bool inp_large = (conv_params.in_strides[0] >= 1ul << 18);
-  bool channels_large = (conv_params.C + conv_params.O) >= 512;
-  bool channels_med = (conv_params.C + conv_params.O) >= 256;
-
  if (groups > 1) {
    const int C_per_group = conv_params.C / groups;
    const int O_per_group = conv_params.O / groups;
@@ -764,10 +760,13 @@ void conv_2D_gpu(
  }

  // Direct to winograd conv
+  bool inp_large =
+      (conv_params.N * conv_params.iS[0] * conv_params.iS[1]) >= 1ul << 12;
+  bool channels_large = (conv_params.C + conv_params.O) >= 256;
  if (!flip && is_stride_one && is_kdil_one && is_idil_one &&
      conv_params.wS[0] == 3 && conv_params.wS[1] == 3 &&
-      conv_params.C % 32 == 0 && conv_params.O % 32 == 0 &&
-      (channels_large || (channels_med && inp_large))) {
+      conv_params.C % 32 == 0 && conv_params.O % 32 == 0 && inp_large &&
+      channels_large) {
    return winograd_conv_2D_gpu(s, d, in, wt, out, conv_params, copies);
  }

@@ -913,12 +912,8 @@ void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
        "[Convolution::eval_gpu] Only supports 1D, 2D or 3D convolutions.");
  }

-  // Clear copies
-  if (copies.size() > 0) {
-    auto command_buffer = d.get_command_buffer(s.index);
-    command_buffer->addCompletedHandler(
-        [copies](MTL::CommandBuffer*) mutable { copies.clear(); });
-  }
+  // Record copies
+  d.add_temporaries(std::move(copies), s.index);
 }

 } // namespace mlx::core
--- a/mlx/backend/metal/copy.cpp
+++ b/mlx/backend/metal/copy.cpp
@@ -10,7 +10,7 @@

 namespace mlx::core {

-constexpr int MAX_COPY_SPECIALIZED_DIMS = 5;
+constexpr int MAX_COPY_SPECIALIZED_DIMS = 3;

 void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
  if (ctype == CopyType::Vector) {
@@ -43,13 +43,12 @@ void copy_gpu(const array& in, array& out, CopyType ctype) {
  copy_gpu(in, out, ctype, out.primitive().stream());
 }

-template <typename stride_t>
 void copy_gpu_inplace(
    const array& in,
    array& out,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& strides_in_pre,
-    const std::vector<stride_t>& strides_out_pre,
+    const Shape& data_shape,
+    const Strides& strides_in_pre,
+    const Strides& strides_out_pre,
    int64_t inp_offset,
    int64_t out_offset,
    CopyType ctype,
@@ -59,43 +58,61 @@ void copy_gpu_inplace(
  }

  // Try to collapse contiguous dims
-  auto [shape, strides] = collapse_contiguous_dims(
-      data_shape, std::vector{strides_in_pre, strides_out_pre});
-  auto& strides_in_ = strides[0];
-  auto& strides_out_ = strides[1];
-
-  bool use_2d = out.data_size() > UINT32_MAX;
-  auto& d = metal::device(s.device);
-  std::string kernel_name;
-  {
-    std::ostringstream kname;
-    switch (ctype) {
-      case CopyType::Scalar:
-        kname << (use_2d ? "s2" : "s");
-        break;
-      case CopyType::Vector:
-        kname << (use_2d ? "v2" : "v");
-        break;
-      case CopyType::General:
-        kname << "g";
-        break;
-      case CopyType::GeneralGeneral:
-        kname << "gg";
-        break;
-    }
-    if ((ctype == CopyType::General || ctype == CopyType::GeneralGeneral) &&
-        shape.size() <= MAX_COPY_SPECIALIZED_DIMS) {
-      kname << shape.size();
-    }
-    kname << "_copy";
-    kname << type_to_name(in) << type_to_name(out);
-    kernel_name = kname.str();
+  auto maybe_collapse =
+      [ctype, &data_shape, &strides_in_pre, &strides_out_pre]() {
+        if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
+          auto [shape, strides] = collapse_contiguous_dims(
+              data_shape,
+              std::vector{strides_in_pre, strides_out_pre},
+              /* size_cap = */ INT32_MAX);
+          return std::make_tuple(shape, strides[0], strides[1]);
+        } else {
+          Strides e{};
+          return std::make_tuple(Shape{}, e, e);
+        }
+      };
+  auto [shape, strides_in_, strides_out_] = maybe_collapse();
+  int ndim = shape.size();
+  bool large;
+  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
+    // Allow for negative strides
+    large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
+  } else {
+    large = out.data_size() > UINT32_MAX;
  }
-
+  auto& d = metal::device(s.device);
+  int work_per_thread = 1;
+  std::string kernel_name;
+  switch (ctype) {
+    case CopyType::Scalar:
+      kernel_name = (large ? "s2" : "s");
+      break;
+    case CopyType::Vector:
+      kernel_name = (large ? "v2" : "v");
+      break;
+    case CopyType::General:
+      kernel_name = "g";
+      break;
+    case CopyType::GeneralGeneral:
+      kernel_name = "gg";
+      break;
+  }
+  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
+    if (shape.size() <= MAX_COPY_SPECIALIZED_DIMS) {
+      kernel_name += std::to_string(shape.size());
+    } else {
+      work_per_thread = large ? 4 : 2;
+      concatenate(kernel_name, "n", std::to_string(work_per_thread));
+    }
+    if (large) {
+      kernel_name += "large";
+    }
+  }
+  concatenate(kernel_name, "_copy", type_to_name(in), type_to_name(out));
  auto kernel = get_copy_kernel(d, kernel_name, in, out);

  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);
  bool donate_in = in.data_shared_ptr() == nullptr;

  inp_offset *= size_of(in.dtype());
@@ -104,50 +121,48 @@ void copy_gpu_inplace(
  compute_encoder.set_input_array(donate_in ? out : in, 0, inp_offset);
  compute_encoder.set_output_array(out, 1, out_offset);

+  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
-    int ndim = shape.size();
-    std::vector<int64_t> strides_in{strides_in_.begin(), strides_in_.end()};
-    std::vector<int64_t> strides_out{strides_out_.begin(), strides_out_.end()};
-
+    Strides strides_in{strides_in_.begin(), strides_in_.end()};
+    Strides strides_out{strides_out_.begin(), strides_out_.end()};
    if (ndim > 3) {
-      set_vector_bytes(compute_encoder, shape, ndim, 2);
+      compute_encoder.set_vector_bytes(shape, ndim, 2);
    }
-    set_vector_bytes(compute_encoder, strides_in, ndim, 3);
+    compute_encoder.set_vector_bytes(strides_in, ndim, 3);
    if (ctype == CopyType::GeneralGeneral) {
-      set_vector_bytes(compute_encoder, strides_out, ndim, 4);
+      compute_encoder.set_vector_bytes(strides_out, ndim, 4);
    }

-    if (ndim > MAX_COPY_SPECIALIZED_DIMS) {
-      compute_encoder->setBytes(&ndim, sizeof(int), 5);
-    }
-
-    int dim0 = ndim > 0 ? shape[ndim - 1] : 1;
-    int dim1 = ndim > 1 ? shape[ndim - 2] : 1;
+    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
+    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;

    size_t data_size = 1;
    for (auto& s : shape)
      data_size *= s;
-    int rest = data_size / (dim0 * dim1);
+    size_t rest = data_size / (dim0 * dim1);
+
+    if (ndim > MAX_COPY_SPECIALIZED_DIMS) {
+      compute_encoder.set_bytes(ndim, 5);
+      dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
+    }

    // NB assuming thread_group_size is a power of 2 larger than 32 x 32
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
    if (thread_group_size != 1024) {
      throw std::runtime_error("[Metal::copy] Must use 1024 sized block");
    }

    auto group_dims = get_block_dims(dim0, dim1, rest);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+    compute_encoder.dispatch_threads(grid_dims, group_dims);
  } else {
    size_t nthreads = out.data_size();
-    MTL::Size grid_dims = use_2d ? get_2d_grid_dims(out.shape(), out.strides())
-                                 : MTL::Size(nthreads, 1, 1);
-    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
    if (thread_group_size > nthreads) {
      thread_group_size = nthreads;
    }
    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+    MTL::Size grid_dims = large ? get_2d_grid_dims(out.shape(), out.strides())
+                                : MTL::Size(nthreads, 1, 1);
+    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
 }

@@ -164,14 +179,13 @@ void copy_gpu_inplace(
+// Convenience overload of copy_gpu_inplace: the copy uses in.shape() as the
+// data shape, the caller-provided input strides and input offset, and the
+// output array's own strides with an output offset of 0.
 void copy_gpu_inplace(
     const array& in,
     array& out,
-    const std::vector<int64_t>& istride,
+    const Strides& istride,
     int64_t ioffset,
     CopyType ctype,
     const Stream& s) {
   assert(in.shape() == out.shape());
-  std::vector<int64_t> ostrides{out.strides().begin(), out.strides().end()};
   return copy_gpu_inplace(
-      in, out, in.shape(), istride, ostrides, ioffset, 0, ctype, s);
+      in, out, in.shape(), istride, out.strides(), ioffset, 0, ctype, s);
 }

 void fill_gpu(const array& val, array& out, const Stream& s) {
@@ -179,26 +193,26 @@ void fill_gpu(const array& val, array& out, const Stream& s) {
    return;
  }
  out.set_data(allocator::malloc_or_wait(out.nbytes()));
-  bool use_2d = out.data_size() > UINT32_MAX;
+  bool large = out.data_size() > UINT32_MAX;
  auto& d = metal::device(s.device);
-  std::string kernel_name = std::string(use_2d ? "s2" : "s") + "_copy" +
+  std::string kernel_name = std::string(large ? "s2" : "s") + "_copy" +
      type_to_name(val) + type_to_name(out);
  auto kernel = get_copy_kernel(d, kernel_name, val, out);
  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  compute_encoder.set_input_array(val, 0);
  compute_encoder.set_output_array(out, 1);

+  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  size_t nthreads = out.data_size();
-  MTL::Size grid_dims = use_2d ? get_2d_grid_dims(out.shape(), out.strides())
-                               : MTL::Size(nthreads, 1, 1);
-  NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (thread_group_size > nthreads) {
    thread_group_size = nthreads;
  }
  MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
-  compute_encoder.dispatchThreads(grid_dims, group_dims);
+  MTL::Size grid_dims = large ? get_2d_grid_dims(out.shape(), out.strides())
+                              : MTL::Size(nthreads, 1, 1);
+  compute_encoder.dispatch_threads(grid_dims, group_dims);
 }

 } // namespace mlx::core
--- a/mlx/backend/metal/copy.h
+++ b/mlx/backend/metal/copy.h
@@ -8,13 +8,12 @@
 namespace mlx::core {

 // Generic copy inplace
-template <typename stride_t>
 void copy_gpu_inplace(
    const array& in,
    array& out,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    const std::vector<stride_t>& o_strides,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    CopyType ctype,
@@ -32,7 +31,7 @@ void copy_gpu_inplace(
 void copy_gpu_inplace(
    const array& in,
    array& out,
-    const std::vector<int64_t>& istride,
+    const Strides& istride,
    int64_t ioffset,
    CopyType ctype,
    const Stream& s);
--- a/mlx/backend/metal/custom_kernel.cpp
+++ b/mlx/backend/metal/custom_kernel.cpp
@@ -32,57 +32,53 @@ void CustomKernel::eval_gpu(
      return copies.back();
    }
  };
-  std::vector<const array> checked_inputs;
+  std::vector<array> checked_inputs;
  for (const array& in : inputs) {
    checked_inputs.push_back(check_input(in));
  }

  auto& d = metal::device(s.device);
  const auto& lib_name = name_;
-  auto lib = d.get_library(lib_name);
-  if (lib == nullptr) {
-    lib = d.get_library(lib_name, metal::utils() + source_);
-  }
+  auto lib =
+      d.get_library(lib_name, [this] { return metal::utils() + source_; });
  auto kernel = d.get_kernel(name_, lib);
  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);
  int index = 0;
  for (int i = 0; i < checked_inputs.size(); i++) {
    const array& in = checked_inputs[i];
-    auto shape_info = shape_infos_[i];
+    auto& shape_info = shape_infos_[i];
    compute_encoder.set_input_array(in, index);
    index++;
    if (in.ndim() > 0) {
      int ndim = in.ndim();
      if (shape_info.shape) {
-        set_vector_bytes(compute_encoder, in.shape(), ndim, index);
+        compute_encoder.set_vector_bytes(in.shape(), ndim, index);
        index++;
      }
      if (shape_info.strides) {
-        set_vector_bytes(compute_encoder, in.strides(), ndim, index);
+        compute_encoder.set_vector_bytes(in.strides(), ndim, index);
        index++;
      }
      if (shape_info.ndim) {
-        compute_encoder->setBytes(&ndim, sizeof(int), index);
+        compute_encoder.set_bytes(ndim, index);
        index++;
      }
    }
  }
-  for (array out : outputs) {
+  for (auto& out : outputs) {
    compute_encoder.set_output_array(out, index);
    index++;
  }

  const auto [tx, ty, tz] = threadgroup_;
-  MTL::Size group_dims = MTL::Size(tx, ty, tz);
  const auto [gx, gy, gz] = grid_;
+  MTL::Size group_dims =
+      MTL::Size(std::min(tx, gx), std::min(ty, gy), std::min(tz, gz));
  MTL::Size grid_dims = MTL::Size(gx, gy, gz);
-  compute_encoder->dispatchThreads(grid_dims, group_dims);
+  compute_encoder.dispatch_threads(grid_dims, group_dims);

-  if (!copies.empty()) {
-    d.get_command_buffer(s.index)->addCompletedHandler(
-        [copies](MTL::CommandBuffer*) mutable { copies.clear(); });
-  }
+  d.add_temporaries(std::move(copies), s.index);
 }

 } // namespace mlx::core::fast
--- a/mlx/backend/metal/device.cpp
+++ b/mlx/backend/metal/device.cpp
@@ -20,18 +20,21 @@ namespace {

 // TODO nicer way to set this or possibly expose as an environment variable
 constexpr int MAX_BUFFERS_PER_QUEUE = 12;
-constexpr int MAX_DISPATCHES_PER_ENCODER = 2;

 constexpr const char* default_mtllib_path = METAL_PATH;

-constexpr auto get_metal_version() {
-#if (MLX_METAL_VERSION >= 320)
-  return MTL::LanguageVersion3_2;
-#elif (MLX_METAL_VERSION >= 310)
-  return MTL::LanguageVersion3_1;
-#else
-  return MTL::LanguageVersion3_0;
-#endif
+// Returns the Metal language version to use, chosen with
+// __builtin_available OS checks. The checks run once; the result is kept in
+// a function-local static and returned on every later call.
+auto get_metal_version() {
+  // Newest language version first, falling back to 3.0
+  auto get_metal_version_ = []() {
+    if (__builtin_available(macOS 15, iOS 18, tvOS 18, visionOS 2, *)) {
+      return MTL::LanguageVersion3_2;
+    } else if (__builtin_available(macOS 14, iOS 17, tvOS 17, visionOS 1, *)) {
+      return MTL::LanguageVersion3_1;
+    } else {
+      return MTL::LanguageVersion3_0;
+    }
+  };
+  static auto metal_version_ = get_metal_version_();
+  return metal_version_;
 }

 auto load_device() {
@@ -121,33 +124,29 @@ MTL::Library* load_library(

 } // namespace

-CommandEncoder::CommandEncoder(MTL::CommandBuffer* cbuf) : cbuf(cbuf) {
-  enc = cbuf->computeCommandEncoder(MTL::DispatchTypeConcurrent);
-  enc->retain();
+CommandEncoder::CommandEncoder(MTL::CommandBuffer* cbuf) {
+  enc_ = cbuf->computeCommandEncoder(MTL::DispatchTypeConcurrent);
+  enc_->retain();
 }

 CommandEncoder::~CommandEncoder() {
-  enc->endEncoding();
-  enc->release();
+  enc_->endEncoding();
+  enc_->release();
 }

+// Binds the buffer backing `a` at argument index `idx`. The bound offset is
+// the array's data offset inside its buffer plus `offset`. The buffer is also
+// recorded in all_inputs_, and a barrier is requested when the buffer is
+// found in prev_outputs_.
 void CommandEncoder::set_input_array(
     const array& a,
     int idx,
     int64_t offset /* = 0 */) {
+  // Record every buffer read through this encoder
+  all_inputs_.insert(a.buffer().ptr());
   auto r_buf = static_cast<MTL::Resource*>(const_cast<void*>(a.buffer().ptr()));
-  if (auto it = outputs.find(r_buf); it != outputs.end()) {
-    // Insert a barrier
-    enc->memoryBarrier(&r_buf, 1);
-
-    // Remove the output
-    outputs.erase(it);
-  }
+  // Only flag the dependency here; the barrier itself is emitted by
+  // maybeInsertBarrier() right before the next dispatch
+  needs_barrier_ =
+      needs_barrier_ | (prev_outputs_.find(r_buf) != prev_outputs_.end());
   auto a_buf = static_cast<const MTL::Buffer*>(a.buffer().ptr());
   auto base_offset = a.data<char>() -
       static_cast<char*>(const_cast<MTL::Buffer*>(a_buf)->contents());
   base_offset += offset;
-  enc->setBuffer(a_buf, base_offset, idx);
+  enc_->setBuffer(a_buf, base_offset, idx);
 }

 void CommandEncoder::set_output_array(
@@ -156,55 +155,49 @@ void CommandEncoder::set_output_array(
    int64_t offset /* = 0 */) {
  // Add barriers before adding the output to the output set
  set_input_array(a, idx, offset);
+  all_outputs_.insert(a.buffer().ptr());
  auto buf = static_cast<MTL::Resource*>(a.buffer().ptr());
-  if (concurrent) {
-    concurrent_outputs.insert(buf);
+  if (concurrent_) {
+    concurrent_outputs_.insert(buf);
  } else {
-    outputs.insert(buf);
+    next_outputs_.insert(buf);
  }
 }

-void CommandEncoder::dispatchThreadgroups(
-    MTL::Size grid_dims,
-    MTL::Size group_dims) {
-  num_dispatches++;
-  enc->dispatchThreadgroups(grid_dims, group_dims);
-  maybe_split();
-}
-
-void CommandEncoder::dispatchThreads(
-    MTL::Size grid_dims,
-    MTL::Size group_dims) {
-  num_dispatches++;
-  enc->dispatchThreads(grid_dims, group_dims);
-  maybe_split();
-}
-
-void CommandEncoder::maybe_split() {
-  if (num_dispatches > MAX_DISPATCHES_PER_ENCODER && !concurrent) {
-    enc->endEncoding();
-    enc->release();
-    num_dispatches = 0;
-    outputs.clear();
-    enc = cbuf->computeCommandEncoder(MTL::DispatchTypeConcurrent);
-    enc->retain();
+// Called by dispatch_threads / dispatch_threadgroups before each dispatch.
+// If an input of the pending dispatch was found in prev_outputs_
+// (needs_barrier_ set), encode a buffer-scope memory barrier and restart
+// prev_outputs_ from the pending dispatch's outputs. Otherwise the pending
+// outputs are merged into prev_outputs_.
+void CommandEncoder::maybeInsertBarrier() {
+  if (needs_barrier_) {
+    enc_->memoryBarrier(MTL::BarrierScopeBuffers);
+    needs_barrier_ = false;
+    // Everything before the barrier is synchronized; only the outputs of the
+    // dispatch about to be encoded remain tracked
+    prev_outputs_ = std::move(next_outputs_);
+  } else {
+    prev_outputs_.insert(next_outputs_.begin(), next_outputs_.end());
   }
+  // Leave next_outputs_ empty for the following dispatch (also resets the
+  // moved-from set)
+  next_outputs_.clear();
+}
+
+void CommandEncoder::dispatch_threadgroups(
+    MTL::Size grid_dims,
+    MTL::Size group_dims) {
+  maybeInsertBarrier();
+  enc_->dispatchThreadgroups(grid_dims, group_dims);
+}
+
+void CommandEncoder::dispatch_threads(
+    MTL::Size grid_dims,
+    MTL::Size group_dims) {
+  maybeInsertBarrier();
+  enc_->dispatchThreads(grid_dims, group_dims);
 }

 Device::Device() {
  auto pool = new_scoped_memory_pool();
  device_ = load_device();
  library_map_ = {{"mlx", load_library(device_)}};
+  arch_ = std::string(device_->architecture()->name()->utf8String());
 }

 Device::~Device() {
  auto pool = new_scoped_memory_pool();
-  for (auto& q : queue_map_) {
-    q.second->release();
-  }
-  for (auto& b : buffer_map_) {
-    b.second.second->release();
-  }
  for (auto& k : kernel_map_) {
    k.second->release();
  }
@@ -219,69 +212,134 @@ void Device::new_queue(int index) {

  // Multiple threads can ask the device for queues
  // We lock this as a critical section for safety
-  const std::lock_guard<std::mutex> lock(mtx_);
  auto q = device_->newCommandQueue(MAX_BUFFERS_PER_QUEUE);
  debug_set_stream_queue_label(q, index);
  if (!q) {
    throw std::runtime_error(
        "[metal::Device] Failed to make new command queue.");
  }
-  queue_map_.insert({index, q});
+  stream_map_.emplace(index, q);
+  if (residency_set_ != nullptr) {
+    q->addResidencySet(residency_set_);
+  }
 }

 int Device::get_command_buffer_ops(int index) {
-  auto bit = buffer_map_.find(index);
-  return bit->second.first;
+  return get_stream_(index).buffer_ops;
 }

 void Device::increment_command_buffer_ops(int index) {
-  auto bit = buffer_map_.find(index);
-  bit->second.first++;
+  get_stream_(index).buffer_ops++;
 }

+// Returns the command buffer of the stream at `index`, creating (and
+// retaining) one from the stream's queue when the stream has none.
+// Throws std::runtime_error if the buffer cannot be created.
 MTL::CommandBuffer* Device::get_command_buffer(int index) {
-  auto bit = buffer_map_.find(index);
-  if (bit == buffer_map_.end()) {
-    auto qit = queue_map_.find(index);
-    if (qit == queue_map_.end()) {
-      throw std::runtime_error(
-          "[metal::Device] Attempting to get command buffer for invalid queue.");
-    }
-
-    auto cb = qit->second->commandBufferWithUnretainedReferences();
-
-    if (!cb) {
+  auto& stream = get_stream_(index);
+  // A null buffer means none is active for this stream yet
+  if (stream.buffer == nullptr) {
+    stream.buffer = stream.queue->commandBufferWithUnretainedReferences();
+    if (!stream.buffer) {
       throw std::runtime_error(
           "[metal::Device] Unable to create new command buffer");
     }
-
     // Increment ref count so the buffer is not garbage collected
-    cb->retain();
-
-    bit = buffer_map_.insert({index, {0, cb}}).first;
+    stream.buffer->retain();
   }
-  return bit->second.second;
+  return stream.buffer;
 }

+// Commits the active command buffer of the stream at `index`, drops the
+// reference taken in get_command_buffer, and resets the stream so the next
+// get_command_buffer call creates a fresh buffer.
+// NOTE(review): dereferences stream.buffer without a null check; presumably
+// callers only commit after a buffer was created -- confirm.
 void Device::commit_command_buffer(int index) {
-  auto bit = buffer_map_.find(index);
-  bit->second.second->commit();
-  bit->second.second->release();
-  buffer_map_.erase(bit);
+  auto& stream = get_stream_(index);
+  stream.buffer->commit();
+  stream.buffer->release();
+  stream.buffer = nullptr;
+  // The op counter is per command buffer, so it restarts with the buffer
+  stream.buffer_ops = 0;
+}
+
+void Device::add_temporary(array arr, int index) {
+  get_stream_(index).temporaries.push_back(std::move(arr));
+}
+
+// Moves every array in `arrays` onto the end of the temporaries list of the
+// stream at `index`. An empty batch is a no-op and does not look up the
+// stream.
+void Device::add_temporaries(std::vector<array> arrays, int index) {
+  if (!arrays.empty()) {
+    auto& pending = get_stream_(index).temporaries;
+    for (auto& arr : arrays) {
+      pending.push_back(std::move(arr));
+    }
+  }
 }

+// Ends the active command encoder of the stream at `index`, if there is one.
+// Before the encoder is destroyed this wires up cross-encoder
+// synchronization with fences and registers a completion handler on the
+// stream's command buffer that releases the temporaries and prunes the
+// stream's output-to-fence map.
 void Device::end_encoding(int index) {
-  encoder_map_.erase(index);
+  auto& stream = get_stream_(index);
+  if (stream.encoder != nullptr) {
+    // Each command encoder has a unique fence. We also store a map of
+    // all previous outputs of command encoders to their corresponding fence.
+    // - The command encoder records its inputs and outputs.
+    // - Wait on a fence if any inputs in the encoder are outputs of a previous
+    //   encoder.
+    // - Update the map of outputs to include this command encoder's outputs.
+    // - Always signal this command encoders fence.
+    // - Add a completion handler for this command encoder that removes outputs
+    //   from the map to limit the growth of the map and avoid unnecessary waits
+    // - Temporaries are a special case as they do not cross command encoder
+    //   boundaries. These can be removed early from the encoders inputs and
+    //   outputs since they don't need synchronization.
+    auto& enc = *stream.encoder;
+    // Remove temporaries from inputs and outputs
+    for (auto& t : stream.temporaries) {
+      // Skip temporaries whose data pointer is null
+      if (t.data<void>() != nullptr) {
+        enc.outputs().erase(t.buffer().ptr());
+        enc.inputs().erase(t.buffer().ptr());
+      }
+    }
+
+    // Keep references to the fences we waited on and put them
+    // in the completion handler so they are not prematurely released
+    std::unordered_set<std::shared_ptr<Fence>> waiting_on;
+    {
+      // stream.outputs is also modified by the completion handler below,
+      // which takes the same mutex
+      std::lock_guard<std::mutex> lk(stream.fence_mtx);
+      for (auto in : enc.inputs()) {
+        if (auto it = stream.outputs.find(in); it != stream.outputs.end()) {
+          // If we've already waited on a fence, don't wait on it again.
+          if (waiting_on.find(it->second) == waiting_on.end()) {
+            enc.wait_for_fence(it->second->fence);
+            waiting_on.insert(it->second);
+          }
+        }
+      }
+      for (auto out : enc.outputs()) {
+        stream.outputs[out] = stream.fence;
+      }
+    }
+    enc.update_fence(stream.fence->fence);
+    // NOTE(review): the handler captures `stream` by reference, so it assumes
+    // the stream entry outlives the command buffer -- confirm.
+    stream.buffer->addCompletedHandler(
+        [&stream,
+         waiting_on = std::move(waiting_on),
+         fence = std::move(stream.fence),
+         outputs = std::move(enc.outputs()),
+         temporaries =
+             std::move(stream.temporaries)](MTL::CommandBuffer*) mutable {
+          temporaries.clear();
+          std::lock_guard<std::mutex> lk(stream.fence_mtx);
+          for (auto o : outputs) {
+            if (auto it = stream.outputs.find(o); it != stream.outputs.end()) {
+              // Only erase entries still owned by this encoder's fence; a
+              // later encoder may have overwritten the entry with its own
+              if (it->second == fence) {
+                stream.outputs.erase(it);
+              }
+            }
+          }
+        });
+  }
+  // Destroying the encoder ends encoding (see ~CommandEncoder)
+  stream.encoder = nullptr;
 }

 CommandEncoder& Device::get_command_encoder(int index) {
-  auto eit = encoder_map_.find(index);
-  if (eit == encoder_map_.end()) {
-    auto cb = get_command_buffer(index);
-    eit =
-        encoder_map_.emplace(index, std::make_unique<CommandEncoder>(cb)).first;
+  auto& stream = get_stream_(index);
+  if (stream.encoder == nullptr) {
+    stream.encoder = std::make_unique<CommandEncoder>(stream.buffer);
+    stream.fence = std::make_shared<Fence>(device_->newFence());
  }
-  return *(eit->second);
+  return *stream.encoder;
 }

 void Device::register_library(
@@ -293,20 +351,7 @@ void Device::register_library(
  }
 }

-MTL::Library* Device::get_library_cache_(const std::string& lib_name) {
-  // Search for cached metal lib
-  MTL::Library* mtl_lib;
-  if (auto it = library_map_.find(lib_name); it != library_map_.end()) {
-    mtl_lib = it->second;
-  } else { // Look for metallib alongside library
-    register_library(lib_name, get_colocated_mtllib_path(lib_name));
-    mtl_lib = library_map_[lib_name];
-  }
-
-  return mtl_lib;
-}
-
-MTL::Library* Device::get_library_(const std::string& source_string) {
+MTL::Library* Device::build_library_(const std::string& source_string) {
  auto pool = new_scoped_memory_pool();

  auto ns_code =
@@ -322,26 +367,7 @@ MTL::Library* Device::get_library_(const std::string& source_string) {
  // Throw error if unable to compile library
  if (!mtl_lib) {
    std::ostringstream msg;
-    msg << "[metal::Device] Unable to build metal library from source" << "\n";
-    if (error) {
-      msg << error->localizedDescription()->utf8String() << "\n";
-    }
-    throw std::runtime_error(msg.str());
-  }
-
-  return mtl_lib;
-}
-
-MTL::Library* Device::get_library_(const MTL::StitchedLibraryDescriptor* desc) {
-  auto pool = new_scoped_memory_pool();
-
-  NS::Error* error = nullptr;
-  auto mtl_lib = device_->newLibrary(desc, &error);
-
-  // Throw error if unable to compile library
-  if (!mtl_lib) {
-    std::ostringstream msg;
-    msg << "[metal::Device] Unable to build stitched metal library" << "\n";
+    msg << "[metal::Device] Unable to build metal library from source\n";
    if (error) {
      msg << error->localizedDescription()->utf8String() << "\n";
    }
@@ -465,68 +491,32 @@ MTL::ComputePipelineState* Device::get_kernel_(
  return kernel;
 }

-MTL::Library* Device::get_library(const std::string& name) {
+MTL::Library* Device::get_library_(const std::string& name) {
+  std::shared_lock lock(library_mtx_);
  auto it = library_map_.find(name);
  return (it != library_map_.end()) ? it->second : nullptr;
 }

+// Returns the library cached under `name`, building it from the source
+// string returned by `builder` when it is not cached yet. `builder` is only
+// invoked on a cache miss. The newly built library is stored in
+// library_map_.
 MTL::Library* Device::get_library(
     const std::string& name,
-    const std::string& source,
-    bool cache /* = true */) {
-  if (cache) {
+    const std::function<std::string(void)>& builder) {
+  {
+    // Lookup under the shared lock
+    std::shared_lock rlock(library_mtx_);
     if (auto it = library_map_.find(name); it != library_map_.end()) {
       return it->second;
     }
   }

-  auto mtl_lib = get_library_(source);
-
-  if (cache) {
-    library_map_.insert({name, mtl_lib});
+  // The shared lock was released above, so another thread may have inserted
+  // the library in between; check again under the exclusive lock
+  std::unique_lock wlock(library_mtx_);
+  if (auto it = library_map_.find(name); it != library_map_.end()) {
+    return it->second;
   }

+  // The build runs while the exclusive lock is held
+  auto mtl_lib = build_library_(builder());
+  library_map_.insert({name, mtl_lib});
   return mtl_lib;
 }

-MTL::Library* Device::get_library(
-    const std::string& name,
-    const MTL::StitchedLibraryDescriptor* desc,
-    bool cache /* = true */) {
-  if (cache) {
-    if (auto it = library_map_.find(name); it != library_map_.end()) {
-      return it->second;
-    }
-  }
-
-  auto mtl_lib = get_library_(desc);
-
-  if (cache) {
-    library_map_.insert({name, mtl_lib});
-  }
-
-  return mtl_lib;
-}
-
-MTL::Function* Device::get_function(
-    const std::string& base_name,
-    MTL::Library* mtl_lib,
-    const std::string& specialized_name /* = "" */,
-    const MTLFCList& func_consts /* = {} */) {
-  return get_function_(base_name, specialized_name, func_consts, mtl_lib);
-}
-
-MTL::Function* Device::get_function(
-    const std::string& base_name,
-    const std::string& lib_name /* = "mlx" */,
-    const std::string& specialized_name /*  = "" */,
-    const MTLFCList& func_consts /* = {} */) {
-  // Search for cached metal lib
-  MTL::Library* mtl_lib = get_library_cache_(lib_name);
-
-  return get_function(base_name, mtl_lib, specialized_name, func_consts);
-}
-
 MTL::LinkedFunctions* Device::get_linked_functions_(
    const std::vector<MTL::Function*>& funcs) {
  if (funcs.empty()) {
@@ -547,34 +537,55 @@ MTL::LinkedFunctions* Device::get_linked_functions_(
  return lfuncs;
 }

+// Slow path of get_kernel: builds the compute pipeline for `hash_name` and
+// stores it in kernel_map_. Holds the exclusive kernel_mtx_ lock for the
+// whole build and re-checks the cache first, so a kernel requested by
+// several threads at once is only built by the first one to get the lock.
+//
+// base_name:        name of the function to pull from `mtl_lib`
+// mtl_lib:          library that contains the function
+// hash_name:        cache key (also passed to get_function_)
+// func_consts:      function constants forwarded to get_function_
+// linked_functions: functions to link into the pipeline
+MTL::ComputePipelineState* Device::get_kernel_(
+    const std::string& base_name,
+    MTL::Library* mtl_lib,
+    const std::string& hash_name,
+    const MTLFCList& func_consts /* = {} */,
+    const std::vector<MTL::Function*>& linked_functions /* = {} */) {
+  // Single writer allowed
+  std::unique_lock wlock(kernel_mtx_);
+
+  // Try loading again to avoid loading twice
+  if (auto it = kernel_map_.find(hash_name); it != kernel_map_.end()) {
+    return it->second;
+  }
+
+  auto pool = new_scoped_memory_pool();
+
+  // Pull kernel from library
+  auto mtl_function = get_function_(base_name, hash_name, func_consts, mtl_lib);
+
+  // Compile kernel to compute pipeline
+  auto mtl_linked_funcs = get_linked_functions_(linked_functions);
+  auto kernel = get_kernel_(hash_name, mtl_function, mtl_linked_funcs);
+
+  // The function and linked-function objects are released once the pipeline
+  // has been created
+  mtl_function->release();
+  mtl_linked_funcs->release();
+
+  // Add kernel to cache
+  kernel_map_.insert({hash_name, kernel});
+
+  return kernel;
+}
+
 MTL::ComputePipelineState* Device::get_kernel(
    const std::string& base_name,
    MTL::Library* mtl_lib,
    const std::string& hash_name /* = "" */,
    const MTLFCList& func_consts /* = {} */,
    const std::vector<MTL::Function*>& linked_functions /* = {} */) {
-  auto pool = new_scoped_memory_pool();
-
-  // Look for cached kernel
  const auto& kname = hash_name.empty() ? base_name : hash_name;
-  if (auto it = kernel_map_.find(kname); it != kernel_map_.end()) {
-    return it->second;
+  {
+    // Multiple readers allowed
+    std::shared_lock lock(kernel_mtx_);
+
+    // Look for cached kernel
+    if (auto it = kernel_map_.find(kname); it != kernel_map_.end()) {
+      return it->second;
+    }
  }
-
-  // Pull kernel from library
-  auto mtl_function = get_function_(base_name, kname, func_consts, mtl_lib);
-
-  // Compile kernel to compute pipeline
-  auto mtl_linked_funcs = get_linked_functions_(linked_functions);
-  auto kernel = get_kernel_(kname, mtl_function, mtl_linked_funcs);
-
-  mtl_function->release();
-  mtl_linked_funcs->release();
-
-  // Add kernel to cache
-  kernel_map_.insert({kname, kernel});
-
-  return kernel;
+  return get_kernel_(base_name, mtl_lib, kname, func_consts, linked_functions);
 }

 MTL::ComputePipelineState* Device::get_kernel(
@@ -583,16 +594,34 @@ MTL::ComputePipelineState* Device::get_kernel(
    const std::string& hash_name /*  = "" */,
    const MTLFCList& func_consts /*  = {} */,
    const std::vector<MTL::Function*>& linked_functions /*  = {} */) {
-  // Look for cached kernel
  const auto& kname = hash_name.size() == 0 ? base_name : hash_name;
-  if (auto it = kernel_map_.find(kname); it != kernel_map_.end()) {
-    return it->second;
+  {
+    // Multiple readers allowed
+    std::shared_lock lock(kernel_mtx_);
+
+    // Look for cached kernel
+    if (auto it = kernel_map_.find(kname); it != kernel_map_.end()) {
+      return it->second;
+    }
  }
-
  // Search for cached metal lib
-  MTL::Library* mtl_lib = get_library_cache_(lib_name);
+  MTL::Library* mtl_lib = get_library_(lib_name);
+  return get_kernel_(base_name, mtl_lib, kname, func_consts, linked_functions);
+}

-  return get_kernel(base_name, mtl_lib, kname, func_consts, linked_functions);
+// Stores the residency set on the device and attaches it to the queue of
+// every stream in stream_map_ (new_queue attaches it to queues created
+// later). Passing nullptr is a no-op.
+// Throws std::runtime_error if a residency set was already set.
+void Device::set_residency_set(const MTL::ResidencySet* residency_set) {
+  if (residency_set_ != nullptr) {
+    throw std::runtime_error(
+        "[Device::set_residency_set] Can only be set once.");
+  }
+  if (residency_set == nullptr) {
+    return;
+  }
+  residency_set_ = residency_set;
+  // Attach residency set to existing command queues
+  for (auto& [_, stream] : stream_map_) {
+    stream.queue->addResidencySet(residency_set_);
+  }
 }

 Device& device(mlx::core::Device) {
@@ -616,21 +645,27 @@ void new_stream(Stream stream) {

 std::unordered_map<std::string, std::variant<std::string, size_t>>
 device_info() {
-  auto raw_device = device(default_device()).mtl_device();
-  auto arch = std::string(raw_device->architecture()->name()->utf8String());
+  auto init_device_info = []()
+      -> std::unordered_map<std::string, std::variant<std::string, size_t>> {
+    auto pool = new_scoped_memory_pool();
+    auto raw_device = device(default_device()).mtl_device();
+    auto arch = std::string(raw_device->architecture()->name()->utf8String());

-  int mib[] = {CTL_HW, HW_MEMSIZE};
-  size_t memsize = 0;
-  size_t length = sizeof(memsize);
+    int mib[] = {CTL_HW, HW_MEMSIZE};
+    size_t memsize = 0;
+    size_t length = sizeof(memsize);

-  sysctl(mib, 2, &memsize, &length, NULL, 0);
+    sysctl(mib, 2, &memsize, &length, NULL, 0);

-  return {
-      {"architecture", arch},
-      {"max_buffer_length", raw_device->maxBufferLength()},
-      {"max_recommended_working_set_size",
-       raw_device->recommendedMaxWorkingSetSize()},
-      {"memory_size", memsize}};
+    return {
+        {"architecture", arch},
+        {"max_buffer_length", raw_device->maxBufferLength()},
+        {"max_recommended_working_set_size",
+         raw_device->recommendedMaxWorkingSetSize()},
+        {"memory_size", memsize}};
+  };
+  static auto device_info_ = init_device_info();
+  return device_info_;
 }

 } // namespace mlx::core::metal
--- a/mlx/backend/metal/device.h
+++ b/mlx/backend/metal/device.h
@@ -7,6 +7,7 @@
 #include <filesystem>
 #include <functional>
 #include <mutex>
+#include <shared_mutex>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -44,43 +45,114 @@ struct CommandEncoder {

  struct ConcurrentContext {
    ConcurrentContext(CommandEncoder& enc) : enc(enc) {
-      enc.concurrent = true;
+      enc.concurrent_ = true; // stays set until this object is destroyed
    }
    ~ConcurrentContext() {
-      enc.concurrent = false;
-      enc.outputs.insert(
-          enc.concurrent_outputs.begin(), enc.concurrent_outputs.end());
-      enc.concurrent_outputs.clear();
+      enc.concurrent_ = false;
+      enc.prev_outputs_.insert( // move the concurrent outputs into prev_outputs_
+          enc.concurrent_outputs_.begin(), enc.concurrent_outputs_.end());
+      enc.concurrent_outputs_.clear();
    }

   private:
    CommandEncoder& enc;
  };

-  MTL::ComputeCommandEncoder* operator->() {
-    return enc;
-  }
-
  void set_input_array(const array& a, int idx, int64_t offset = 0);
  void set_output_array(array& a, int idx, int64_t offset = 0);
-  void dispatchThreadgroups(MTL::Size grid_dims, MTL::Size group_dims);
-  void dispatchThreads(MTL::Size grid_dims, MTL::Size group_dims);
+  void dispatch_threadgroups(MTL::Size grid_dims, MTL::Size group_dims);
+  void dispatch_threads(MTL::Size grid_dims, MTL::Size group_dims);
+  void maybeInsertBarrier();
+  // Thin forwarders to the matching MTL::ComputeCommandEncoder calls on enc_
+  void set_compute_pipeline_state(MTL::ComputePipelineState* kernel) {
+    enc_->setComputePipelineState(kernel);
+  }
+
+  void wait_for_fence(MTL::Fence* fence) {
+    enc_->waitForFence(fence);
+  }
+
+  void update_fence(MTL::Fence* fence) {
+    enc_->updateFence(fence);
+  }
+  // setBytes wrappers; length = count * sizeof(T); nelems is not bounds-checked
+  template <typename T>
+  void set_vector_bytes(const std::vector<T>& vec, size_t nelems, int idx) {
+    enc_->setBytes(vec.data(), nelems * sizeof(T), idx);
+  }
+  template <typename T>
+  void set_vector_bytes(const std::vector<T>& vec, int idx) {
+    return set_vector_bytes(vec, vec.size(), idx);
+  }
+  // Sends n consecutive T values starting at v
+  template <typename T>
+  void set_bytes(const T* v, int n, int idx) {
+    return enc_->setBytes(v, n * sizeof(T), idx);
+  }
+  // Sends the bytes of the single object v (sizeof(T) bytes)
+  template <typename T>
+  void set_bytes(const T& v, int idx) {
+    return enc_->setBytes(&v, sizeof(T), idx);
+  }

  ConcurrentContext start_concurrent() {
    return ConcurrentContext(*this);
  }
-
  ~CommandEncoder();

- private:
-  void maybe_split();
+  // Inputs to all kernels in the encoder including temporaries
+  std::unordered_set<const void*>& inputs() {
+    return all_inputs_;
+  };

-  int num_dispatches{0};
-  MTL::CommandBuffer* cbuf;
-  MTL::ComputeCommandEncoder* enc;
-  bool concurrent{false};
-  std::unordered_set<MTL::Resource*> outputs;
-  std::unordered_set<MTL::Resource*> concurrent_outputs;
+  // Outputs of all kernels in the encoder including temporaries (live set)
+  std::unordered_set<const void*>& outputs() {
+    return all_outputs_; // by reference, like inputs(): no copy, edits persist
+  };
+
+ private:
+  MTL::ComputeCommandEncoder* enc_;
+  bool needs_barrier_{false};
+  bool concurrent_{false};
+  std::unordered_set<MTL::Resource*> prev_outputs_;
+  std::unordered_set<MTL::Resource*> next_outputs_;
+  std::unordered_set<MTL::Resource*> concurrent_outputs_;
+  std::unordered_set<const void*> all_inputs_;
+  std::unordered_set<const void*> all_outputs_;
+};
+
+struct Fence { // wrapper that calls release() on `fence` once, at destruction
+  Fence(MTL::Fence* fence) : fence(fence) {}
+  Fence(const Fence&) = delete; // a copy would call release() a second time
+  Fence& operator=(const Fence&) = delete;
+  ~Fence() { fence->release(); }
+  MTL::Fence* fence;
+};
+
+struct DeviceStream {
+  DeviceStream(MTL::CommandQueue* queue) : queue(queue) {};
+  ~DeviceStream() {
+    queue->release(); // the queue is released unconditionally
+    if (buffer != nullptr) { // buffer starts as nullptr; release only when set
+      buffer->release();
+    }
+  };
+  MTL::CommandQueue* queue;
+  // A map of prior command encoder outputs to their corresponding fence
+  std::unordered_map<const void*, std::shared_ptr<Fence>> outputs;
+  // Used to allow thread-safe access to the outputs map
+  std::mutex fence_mtx;
+
+  // The buffer and buffer op count are updated
+  // between command buffers
+  MTL::CommandBuffer* buffer{nullptr};
+  int buffer_ops{0};
+
+  // The command encoder, fence, and temporaries are updated between command
+  // encoders
+  std::unique_ptr<CommandEncoder> encoder{nullptr};
+  std::shared_ptr<Fence> fence;
+  std::vector<array> temporaries;
 };

 class Device {
@@ -94,6 +166,10 @@ class Device {
    return device_;
  };

+  const std::string& get_architecture() const { // read-only view of arch_
+    return arch_;
+  }
+
  void new_queue(int index);
  MTL::CommandBuffer* get_command_buffer(int index);
  int get_command_buffer_ops(int index);
@@ -114,29 +190,9 @@ class Device {
    }
  }

-  MTL::Library* get_library(const std::string& name);
-
  MTL::Library* get_library(
      const std::string& name,
-      const std::string& source_string,
-      bool cache = true);
-
-  MTL::Library* get_library(
-      const std::string& name,
-      const MTL::StitchedLibraryDescriptor* desc,
-      bool cache = true);
-
-  MTL::Function* get_function(
-      const std::string& base_name,
-      MTL::Library* mtl_lib,
-      const std::string& specialized_name = "",
-      const MTLFCList& func_consts = {});
-
-  MTL::Function* get_function(
-      const std::string& base_name,
-      const std::string& lib_name = "mlx",
-      const std::string& specialized_name = "",
-      const MTLFCList& func_consts = {});
+      const std::function<std::string(void)>& builder);

  MTL::ComputePipelineState* get_kernel(
      const std::string& base_name,
@@ -155,11 +211,20 @@ class Device {
  MTL::ArgumentEncoder* argument_encoder(
      const std::vector<MTL::ArgumentDescriptor*>& arg_descs) const;

+  // Record temporary arrays for the given stream index
+  void add_temporary(array arr, int index);
+  void add_temporaries(std::vector<array> arrays, int index);
+
+  void set_residency_set(const MTL::ResidencySet* residency_set);
+
 private:
+  DeviceStream& get_stream_(int index) {
+    return stream_map_.at(index); // throws std::out_of_range for an unknown index
+  }
  MTL::Library* get_library_cache_(const std::string& name);

-  MTL::Library* get_library_(const std::string& source_string);
-  MTL::Library* get_library_(const MTL::StitchedLibraryDescriptor* desc);
+  MTL::Library* get_library_(const std::string& name);
+  MTL::Library* build_library_(const std::string& source_string);

  MTL::Function* get_function_(const std::string& name, MTL::Library* mtl_lib);

@@ -181,13 +246,23 @@ class Device {
      const MTL::Function* mtl_function,
      const MTL::LinkedFunctions* linked_functions);

+  MTL::ComputePipelineState* get_kernel_(
+      const std::string& base_name,
+      MTL::Library* mtl_lib,
+      const std::string& hash_name,
+      const MTLFCList& func_consts = {},
+      const std::vector<MTL::Function*>& linked_functions = {});
+
  MTL::Device* device_;
-  std::unordered_map<int32_t, MTL::CommandQueue*> queue_map_;
-  std::unordered_map<int32_t, std::pair<int, MTL::CommandBuffer*>> buffer_map_;
-  std::unordered_map<int32_t, std::unique_ptr<CommandEncoder>> encoder_map_;
+  std::unordered_map<int32_t, DeviceStream> stream_map_;
+
+  std::shared_mutex kernel_mtx_;
  std::unordered_map<std::string, MTL::ComputePipelineState*> kernel_map_;
+
+  std::shared_mutex library_mtx_;
  std::unordered_map<std::string, MTL::Library*> library_map_;
-  std::mutex mtx_;
+  const MTL::ResidencySet* residency_set_{nullptr};
+  std::string arch_;
 };

 Device& device(mlx::core::Device);
--- a/mlx/backend/metal/event.cpp
+++ b/mlx/backend/metal/event.cpp
@@ -27,4 +27,9 @@ void Event::signal() {
  static_cast<MTL::SharedEvent*>(raw_event().get())->setSignaledValue(value());
 }

+bool Event::is_signaled() const { // true once signaledValue() reaches value()
+  auto* shared_event = static_cast<MTL::SharedEvent*>(raw_event().get());
+  return shared_event->signaledValue() >= value();
+}
+
 } // namespace mlx::core
--- a/mlx/backend/metal/fft.cpp
+++ b/mlx/backend/metal/fft.cpp
@@ -363,7 +363,7 @@ void multi_upload_bluestein_fft(
  auto [w_k, w_q] = compute_bluestein_constants(n, plan.bluestein_n);

  // Broadcast w_q and w_k to the batch size
-  std::vector<size_t> b_strides(in.ndim(), 0);
+  Strides b_strides(in.ndim(), 0);
  b_strides[axis] = 1;
  array w_k_broadcast({}, complex64, nullptr, {});
  array w_q_broadcast({}, complex64, nullptr, {});
@@ -386,8 +386,8 @@ void multi_upload_bluestein_fft(
    copies.push_back(slice_temp);
    copies.push_back(conj_temp);

-    std::vector<int> rstarts(in.ndim(), 0);
-    std::vector<int> rstrides(in.ndim(), 1);
+    Shape rstarts(in.ndim(), 0);
+    Shape rstrides(in.ndim(), 1);
    rstarts[axis] = in.shape(axis) - back_offset;
    rstrides[axis] = -1;
    unary_op_gpu({in}, conj_temp, "Conjugate", s);
@@ -431,19 +431,19 @@ void multi_upload_bluestein_fft(
      s);

  int offset = plan.bluestein_n - (2 * n - 1);
-  std::vector<int> starts(in.ndim(), 0);
-  std::vector<int> strides(in.ndim(), 1);
+  Shape starts(in.ndim(), 0);
+  Shape strides(in.ndim(), 1);
  starts[axis] = plan.bluestein_n - offset - n;
  slice_gpu(pad_temp1, temp, starts, strides, s);

  binary_op_gpu_inplace({temp, w_k_broadcast}, temp1, "Multiply", s);

  if (real && !inverse) {
-    std::vector<int> rstarts(in.ndim(), 0);
-    std::vector<int> rstrides(in.ndim(), 1);
+    Shape rstarts(in.ndim(), 0);
+    Shape rstrides(in.ndim(), 1);
    slice_gpu(temp1, out, rstarts, strides, s);
  } else if (real && inverse) {
-    std::vector<size_t> b_strides(in.ndim(), 0);
+    Strides b_strides(in.ndim(), 0);
    auto inv_n = array({1.0f / n}, {1}, float32);
    array temp_float(out.shape(), out.dtype(), nullptr, {});
    copies.push_back(temp_float);
@@ -531,8 +531,8 @@ void fft_op(
      return x;
    } else {
      array x_copy(x.shape(), x.dtype(), nullptr, {});
-      std::vector<size_t> strides;
-      size_t cur_stride = x.shape(axis);
+      Strides strides;
+      int64_t cur_stride = x.shape(axis);
      for (int a = 0; a < x.ndim(); a++) {
        if (a == axis) {
          strides.push_back(1);
@@ -575,8 +575,7 @@ void fft_op(
  auto plan = plan_fft(n);
  if (plan.four_step) {
    four_step_fft(in, out, axis, inverse, real, plan, copies, s);
-    d.get_command_buffer(s.index)->addCompletedHandler(
-        [copies](MTL::CommandBuffer*) mutable { copies.clear(); });
+    d.add_temporaries(std::move(copies), s.index);
    return;
  }

@@ -700,7 +699,7 @@ void fft_op(
    auto kernel =
        get_fft_kernel(d, base_name, hash_name, func_consts, template_def);

-    compute_encoder->setComputePipelineState(kernel);
+    compute_encoder.set_compute_pipeline_state(kernel);
    compute_encoder.set_input_array(in_contiguous, 0);
    compute_encoder.set_output_array(out, 1);

@@ -712,9 +711,9 @@ void fft_op(

      compute_encoder.set_input_array(w_q, 2); // w_q
      compute_encoder.set_input_array(w_k, 3); // w_k
-      compute_encoder->setBytes(&n, sizeof(int), 4);
-      compute_encoder->setBytes(&plan.bluestein_n, sizeof(int), 5);
-      compute_encoder->setBytes(&total_batch_size, sizeof(int), 6);
+      compute_encoder.set_bytes(n, 4);
+      compute_encoder.set_bytes(plan.bluestein_n, 5);
+      compute_encoder.set_bytes(total_batch_size, 6);
    } else if (plan.rader_n > 1) {
      auto [b_q, g_q, g_minus_q] = compute_raders_constants(plan.rader_n, s);
      copies.push_back(b_q);
@@ -724,25 +723,25 @@ void fft_op(
      compute_encoder.set_input_array(b_q, 2);
      compute_encoder.set_input_array(g_q, 3);
      compute_encoder.set_input_array(g_minus_q, 4);
-      compute_encoder->setBytes(&n, sizeof(int), 5);
-      compute_encoder->setBytes(&total_batch_size, sizeof(int), 6);
-      compute_encoder->setBytes(&plan.rader_n, sizeof(int), 7);
+      compute_encoder.set_bytes(n, 5);
+      compute_encoder.set_bytes(total_batch_size, 6);
+      compute_encoder.set_bytes(plan.rader_n, 7);
    } else if (four_step_params.required) {
-      compute_encoder->setBytes(&four_step_params.n1, sizeof(int), 2);
-      compute_encoder->setBytes(&four_step_params.n2, sizeof(int), 3);
-      compute_encoder->setBytes(&total_batch_size, sizeof(int), 4);
+      compute_encoder.set_bytes(four_step_params.n1, 2);
+      compute_encoder.set_bytes(four_step_params.n2, 3);
+      compute_encoder.set_bytes(total_batch_size, 4);
    } else {
-      compute_encoder->setBytes(&n, sizeof(int), 2);
-      compute_encoder->setBytes(&total_batch_size, sizeof(int), 3);
+      compute_encoder.set_bytes(n, 2);
+      compute_encoder.set_bytes(total_batch_size, 3);
    }

    auto group_dims = MTL::Size(1, threadgroup_batch_size, threads_per_fft);
    auto grid_dims =
        MTL::Size(batch_size, threadgroup_batch_size, threads_per_fft);
-    compute_encoder->dispatchThreads(grid_dims, group_dims);
+    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
-  d.get_command_buffer(s.index)->addCompletedHandler(
-      [copies](MTL::CommandBuffer*) mutable { copies.clear(); });
+
+  d.add_temporaries(std::move(copies), s.index);
 }

 void fft_op(
@@ -778,15 +777,14 @@ void nd_fft_op(
    // Mirror np.fft.(i)rfftn and perform a real transform
    // only on the final axis.
    bool step_real = (real && index == axes.size() - 1);
-    int step_shape = inverse ? out.shape(axis) : in.shape(axis);
+    auto step_shape = inverse ? out.shape(axis) : in.shape(axis);
    const array& in_arr = i == axes.size() - 1 ? in : temp_arrs[1 - i % 2];
    array& out_arr = i == 0 ? out : temp_arrs[i % 2];
    fft_op(in_arr, out_arr, axis, inverse, step_real, inplace, s);
  }

  auto& d = metal::device(s.device);
-  d.get_command_buffer(s.index)->addCompletedHandler(
-      [temp_arrs](MTL::CommandBuffer*) mutable { temp_arrs.clear(); });
+  d.add_temporaries(std::move(temp_arrs), s.index);
 }

 void FFT::eval_gpu(const std::vector<array>& inputs, array& out) {
--- a/mlx/backend/metal/hadamard.cpp
+++ b/mlx/backend/metal/hadamard.cpp
@@ -60,32 +60,6 @@ std::string gen_hadamard_codelet(int m) {
  return source.str();
 }

-void launch_hadamard(
-    const array& in,
-    array& out,
-    int batch_size,
-    int threads_per,
-    const std::string kernel_name,
-    float scale,
-    const Stream& s) {
-  auto& d = metal::device(s.device);
-
-  const auto& lib_name = kernel_name.substr(1);
-  auto lib = d.get_library(lib_name);
-  auto kernel = d.get_kernel(kernel_name, lib);
-  assert(threads_per <= kernel->maxTotalThreadsPerThreadgroup());
-
-  auto& compute_encoder = d.get_command_encoder(s.index);
-  compute_encoder->setComputePipelineState(kernel);
-  compute_encoder.set_input_array(in, 0);
-  compute_encoder.set_output_array(out, 1);
-  compute_encoder->setBytes(&scale, sizeof(float), 2);
-
-  MTL::Size group_dims = MTL::Size(1, threads_per, 1);
-  MTL::Size grid_dims = MTL::Size(batch_size, threads_per, 1);
-  compute_encoder->dispatchThreads(grid_dims, group_dims);
-}
-
 void Hadamard::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& s = stream();

@@ -113,7 +87,8 @@ void Hadamard::eval_gpu(const std::vector<array>& inputs, array& out) {
    out.set_data(allocator::malloc_or_wait(out.nbytes()));
  }

-  auto [n, m] = decompose_hadamard(in.shape(axis));
+  int n, m;
+  std::tie(n, m) = decompose_hadamard(in.shape(axis));

  if (n * (int)size_of(in.dtype()) > MAX_HADAMARD_BYTES) {
    throw std::invalid_argument(
@@ -129,8 +104,7 @@ void Hadamard::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto kernel_name = kname.str();
  auto& d = metal::device(s.device);
  const auto& lib_name = kernel_name;
-  auto lib = d.get_library(lib_name);
-  if (lib == nullptr) {
+  auto lib = d.get_library(lib_name, [&]() {
    std::ostringstream kernel_source;
    auto codelet = gen_hadamard_codelet(m);
    kernel_source << metal::utils() << codelet << metal::hadamard();
@@ -148,12 +122,31 @@ void Hadamard::eval_gpu(const std::vector<array>& inputs, array& out) {
        n,
        m,
        read_width);
-    lib = d.get_library(lib_name, kernel_source.str());
-  }
+    return kernel_source.str();
+  });

  int batch_size = in.size() / n;
  int threads_per = n / max_radix;

+  auto& compute_encoder = d.get_command_encoder(s.index);
+
+  auto launch_hadamard = [&](const array& in,
+                             array& out,
+                             const std::string& kernel_name,
+                             float scale) {
+    auto kernel = d.get_kernel(kernel_name, lib);
+    assert(threads_per <= kernel->maxTotalThreadsPerThreadgroup());
+    // Bind the kernel, then in -> index 0, out -> index 1, scale -> index 2
+    compute_encoder.set_compute_pipeline_state(kernel);
+    compute_encoder.set_input_array(in, 0);
+    compute_encoder.set_output_array(out, 1);
+    compute_encoder.set_bytes(scale, 2);
+    // threads_per and batch_size are captured by reference, read at call time
+    MTL::Size group_dims = MTL::Size(1, threads_per, 1);
+    MTL::Size grid_dims = MTL::Size(batch_size, threads_per, 1);
+    compute_encoder.dispatch_threads(grid_dims, group_dims);
+  };
+
  if (m > 1) {
    // When m is greater than 1, we decompose the
    // computation into two uploads to the GPU:
@@ -171,33 +164,17 @@ void Hadamard::eval_gpu(const std::vector<array>& inputs, array& out) {
    temp.set_data(allocator::malloc_or_wait(temp.nbytes()));
    copies.push_back(temp);

-    launch_hadamard(
-        in_contiguous,
-        temp,
-        batch_size,
-        threads_per,
-        "n" + kernel_name,
-        1.0,
-        s);
+    launch_hadamard(in_contiguous, temp, "n" + kernel_name, 1.0);

    // Metal sometimes reports 256 max threads per group for hadamard_m kernel
    threads_per = std::min(n / read_width, MAX_HADAMARD_THREADS_PER_GROUP);
    batch_size = in.size() / m / read_width / threads_per;
-    launch_hadamard(
-        temp, out, batch_size, threads_per, "m" + kernel_name, scale_, s);
+    launch_hadamard(temp, out, "m" + kernel_name, scale_);
  } else {
-    launch_hadamard(
-        in_contiguous,
-        out,
-        batch_size,
-        threads_per,
-        "n" + kernel_name,
-        scale_,
-        s);
+    launch_hadamard(in_contiguous, out, "n" + kernel_name, scale_);
  }

-  d.get_command_buffer(s.index)->addCompletedHandler(
-      [copies](MTL::CommandBuffer*) mutable { copies.clear(); });
+  d.add_temporaries(std::move(copies), s.index);
 }

 } // namespace mlx::core
--- a/mlx/backend/metal/indexing.cpp
+++ b/mlx/backend/metal/indexing.cpp
@@ -53,28 +53,31 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
  int idx_ndim = nidx ? inputs[1].ndim() : 0;
  size_t ndim = src.ndim();

-  std::string lib_name;
-  std::string kernel_name;
-  std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";
-  {
-    std::ostringstream kname;
-    kname << "gather" << type_to_name(out) << idx_type_name << "_" << nidx
-          << "_" << idx_ndim;
-    lib_name = kname.str();
-    kernel_name = lib_name;
-  }
+  bool large_index = nidx && inputs[1].size() > UINT32_MAX;
+  bool large_src = src.size() > UINT32_MAX;
+  bool large_out = out.size() > UINT32_MAX;
+  bool large = large_index || large_src || large_out;

-  auto lib = d.get_library(lib_name);
-  if (lib == nullptr) {
-    std::ostringstream kernel_source;
-    kernel_source << metal::utils() << metal::gather();
+  std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";
+  std::string kernel_name = fmt::format(
+      "gather{0}{1}_{2}_{3}_{4}",
+      type_to_name(out),
+      idx_type_name,
+      nidx,
+      idx_ndim,
+      large ? "int64_t" : "uint");
+  std::string lib_name = kernel_name;
+
+  auto lib = d.get_library(lib_name, [&]() {
+    std::string kernel_source = metal::utils();
+    kernel_source += metal::gather();
    std::string out_type_str = get_type_string(out.dtype());
    std::string idx_type_str =
        nidx ? get_type_string(inputs[1].dtype()) : "bool";
    auto [idx_args, idx_arr] = make_index_args(idx_type_str, nidx);

    // Index dimension specializations
-    kernel_source << fmt::format(
+    kernel_source += fmt::format(
        gather_kernels,
        type_to_name(out) + idx_type_name,
        out_type_str,
@@ -82,13 +85,14 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
        nidx,
        idx_args,
        idx_arr,
-        idx_ndim);
-    lib = d.get_library(lib_name, kernel_source.str());
-  }
+        idx_ndim,
+        large ? "int64_t" : "uint");
+    return kernel_source;
+  });

  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kernel_name, lib);
-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  size_t slice_size = 1;
  for (auto s : slice_sizes_) {
@@ -114,17 +118,17 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
  // Collect all idx shapes and strides into one place
  std::vector<int> idx_shapes;
  std::vector<size_t> idx_strides;
-
+  std::vector<char> idx_contigs;
  for (int i = 0; i < nidx; ++i) {
    idx_shapes.insert(
        idx_shapes.end(),
        inputs[i + 1].shape().begin(),
        inputs[i + 1].shape().end());
-
    idx_strides.insert(
        idx_strides.end(),
        inputs[i + 1].strides().begin(),
        inputs[i + 1].strides().end());
+    idx_contigs.push_back(inputs[i + 1].flags().row_contiguous);
  }

  // Set all the buffers
@@ -132,21 +136,20 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
  compute_encoder.set_output_array(out, 1);

  // Set source info
-  compute_encoder->setBytes(src.shape().data(), ndim * sizeof(int), 2);
-  compute_encoder->setBytes(src.strides().data(), ndim * sizeof(size_t), 3);
-  compute_encoder->setBytes(&ndim, sizeof(size_t), 4);
-  compute_encoder->setBytes(slice_sizes_.data(), ndim * sizeof(int), 5);
-  compute_encoder->setBytes(axes_.data(), nidx * sizeof(int), 6);
+  compute_encoder.set_vector_bytes(src.shape(), 2);
+  compute_encoder.set_vector_bytes(src.strides(), 3);
+  compute_encoder.set_bytes(ndim, 4);
+  compute_encoder.set_vector_bytes(slice_sizes_, 5);
+  compute_encoder.set_vector_bytes(axes_, 6);

  // Set index info
  //
  // We don't need to check for empty idx_shapes because gather has a
  // idx_ndim == 0 specialization
-  compute_encoder->setBytes(
-      idx_shapes.data(), idx_shapes.size() * sizeof(int), 7);
-  compute_encoder->setBytes(
-      idx_strides.data(), idx_strides.size() * sizeof(size_t), 8);
-  compute_encoder->setBytes(&idx_ndim, sizeof(int), 9);
+  compute_encoder.set_vector_bytes(idx_shapes, 7);
+  compute_encoder.set_vector_bytes(idx_strides, 8);
+  compute_encoder.set_vector_bytes(idx_contigs, 9);
+  compute_encoder.set_bytes(idx_ndim, 10);

  // Set index buffers
  for (int i = 0; i < nidx; ++i) {
@@ -154,7 +157,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
  }

  // Launch grid
-  compute_encoder.dispatchThreads(grid_dims, group_dims);
+  compute_encoder.dispatch_threads(grid_dims, group_dims);
 }

 void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
@@ -173,12 +176,20 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  }

  // Copy src into out
-  auto copy_type =
-      inputs[0].data_size() == 1 ? CopyType::Scalar : CopyType::General;
+  CopyType copy_type;
+  if (inputs[0].data_size() == 1) {
+    copy_type = CopyType::Scalar;
+  } else if (inputs[0].flags().row_contiguous) {
+    copy_type = CopyType::Vector;
+  } else {
+    copy_type = CopyType::General;
+  }
  copy_gpu(inputs[0], out, copy_type);

+  auto& upd = inputs.back();
+
  // Empty update
-  if (inputs.back().size() == 0) {
+  if (upd.size() == 0) {
    return;
  }

@@ -187,23 +198,22 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& d = metal::device(s.device);

  int idx_ndim = nidx ? inputs[1].ndim() : 0;
-  bool index_nd1_specialization = (idx_ndim == 1);
+  size_t idx_size = nidx ? inputs[1].size() : 1;

-  // Bail from fast path (1d index specialization) if scatter dims aren't
-  // the outermost dims and contiguous since update access won't be raster
-  // order.
-  for (auto i = 0; i < axes_.size() && index_nd1_specialization; i++) {
-    index_nd1_specialization &= (axes_[i] == i);
+  auto idx_to_out = idx_size / out.size();
+  int nwork;
+  if (idx_ndim <= 1 || idx_to_out < 1) {
+    nwork = 1;
+  } else if (idx_to_out <= 4) {
+    nwork = 4;
+  } else if (idx_to_out < 16) {
+    nwork = 8;
+  } else if (idx_to_out < 32) {
+    nwork = 16;
+  } else {
+    nwork = 32;
  }

-  // Bail from fast path (1d index specialization) if any of the dims are
-  // broadcasted, since we can't rely on linear indexing in that case.
-  for (int i = 1; i < inputs.size() && index_nd1_specialization; i++) {
-    index_nd1_specialization &= inputs[i].flags().row_contiguous;
-  }
-
-  std::string lib_name;
-  std::string kernel_name;
  std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";
  std::string op_name;
  switch (reduce_type_) {
@@ -223,24 +233,25 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
      op_name = "min";
      break;
  }
+  auto upd_contig = upd.flags().row_contiguous;
+  bool large_out = out.size() > UINT32_MAX;
+  bool large_idx = nidx && (inputs[1].size() > UINT32_MAX);
+  bool large_upd = upd.size() > UINT32_MAX;
+  bool large = large_out || large_idx || large_upd;
+  std::string kernel_name = fmt::format(
+      "scatter{0}{1}_{2}_{3}_{4}_nwork{5}_{6}",
+      type_to_name(out),
+      idx_type_name,
+      op_name,
+      nidx,
+      upd_contig ? "updc_true" : "updc_false",
+      nwork,
+      large ? "int64_t" : "uint");
+  std::string lib_name = kernel_name;

-  {
-    std::ostringstream kname;
-    if (index_nd1_specialization) {
-      kname << "scatter_1d_index" << type_to_name(out) << idx_type_name;
-    } else {
-      kname << "scatter" << type_to_name(out) << idx_type_name;
-    }
-    kname << "_" << op_name << "_" << nidx;
-    lib_name = kname.str();
-    kernel_name = kname.str();
-  }
-
-  auto lib = d.get_library(lib_name);
-  if (lib == nullptr) {
-    std::ostringstream kernel_source;
-    kernel_source << metal::utils() << metal::reduce_utils()
-                  << metal::scatter();
+  auto lib = d.get_library(lib_name, [&]() {
+    std::string kernel_source = metal::utils();
+    concatenate(kernel_source, metal::reduce_utils(), metal::scatter());

    std::string out_type_str = get_type_string(out.dtype());
    std::string idx_type_str =
@@ -264,11 +275,11 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
        break;
    }
    if (reduce_type_ != Scatter::None) {
-      op_type = fmt::format(op_type, out_type_str);
+      op_type = fmt::format(fmt::runtime(op_type), out_type_str);
    }
    auto [idx_args, idx_arr] = make_index_args(idx_type_str, nidx);

-    kernel_source << fmt::format(
+    kernel_source += fmt::format(
        scatter_kernels,
        type_to_name(out) + idx_type_name + "_" + op_name,
        out_type_str,
@@ -276,126 +287,105 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
        op_type,
        nidx,
        idx_args,
-        idx_arr);
-    lib = d.get_library(lib_name, kernel_source.str());
-  }
+        idx_arr,
+        upd_contig,
+        nwork,
+        large ? "int64_t" : "uint");
+    return kernel_source;
+  });

  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kernel_name, lib);

-  auto& upd = inputs.back();
  size_t nthreads = upd.size();

-  compute_encoder->setComputePipelineState(kernel);
+  compute_encoder.set_compute_pipeline_state(kernel);

  // Set all the buffers
  compute_encoder.set_input_array(upd, 1);
  compute_encoder.set_output_array(out, 2);

  // Set update info
-  uint upd_ndim = upd.ndim();
+  size_t upd_ndim = upd.ndim();
  size_t upd_size = 1;
  for (int i = idx_ndim; i < upd.ndim(); ++i) {
    upd_size *= upd.shape(i);
  }
-  if (index_nd1_specialization) {
-    compute_encoder->setBytes(
-        out.shape().data(), out.shape().size() * sizeof(int), 3);
-    compute_encoder->setBytes(
-        out.strides().data(), out.strides().size() * sizeof(size_t), 4);
-
-    size_t out_ndim = out.ndim();
-    compute_encoder->setBytes(&out_ndim, sizeof(out_ndim), 5);
-    if (upd_ndim <= 1) {
-      // Placeholder so Metal doesn't compalain
-      int shape_ = 0;
-      compute_encoder->setBytes(&shape_, sizeof(int), 6);
-    } else {
-      compute_encoder->setBytes(upd.shape().data(), upd_ndim * sizeof(int), 6);
-    }
-    compute_encoder->setBytes(&upd_ndim, sizeof(size_t), 7);
-    compute_encoder->setBytes(&upd_size, sizeof(size_t), 8);
-
-    // Set index buffers
-    for (int i = 0; i < nidx; ++i) {
-      compute_encoder.set_input_array(inputs[i + 1], 20 + i);
-    }
-
-    // Launch grid
-    MTL::Size grid_dims = MTL::Size(upd_size, nthreads / upd_size, 1);
-    MTL::Size group_dims = get_block_dims(upd_size, nthreads / upd_size, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
-
-  } else {
-    // Collect all idx shapes and strides into one place
-    std::vector<int> idx_shapes;
-    std::vector<size_t> idx_strides;
-
-    for (int i = 0; i < nidx; ++i) {
-      idx_shapes.insert(
-          idx_shapes.end(),
-          inputs[i + 1].shape().begin(),
-          inputs[i + 1].shape().end());
-
-      idx_strides.insert(
-          idx_strides.end(),
-          inputs[i + 1].strides().begin(),
-          inputs[i + 1].strides().end());
-    }
-
-    if (upd_ndim == 0) {
-      // Need placeholders so Metal doesn't compalain
-      int shape_ = 0;
-      size_t stride_ = 0;
-      compute_encoder->setBytes(&shape_, sizeof(int), 3);
-      compute_encoder->setBytes(&stride_, sizeof(size_t), 4);
-    } else {
-      compute_encoder->setBytes(upd.shape().data(), upd_ndim * sizeof(int), 3);
-      compute_encoder->setBytes(
-          upd.strides().data(), upd_ndim * sizeof(size_t), 4);
-    }
-    compute_encoder->setBytes(&upd_ndim, sizeof(size_t), 5);
-    compute_encoder->setBytes(&upd_size, sizeof(size_t), 6);
-
-    // Set output info
-    size_t out_ndim = out.ndim();
-    if (out_ndim == 0) {
-      // Need placeholders so Metal doesn't compalain
-      int shape_ = 0;
-      size_t stride_ = 0;
-      compute_encoder->setBytes(&shape_, sizeof(int), 7);
-      compute_encoder->setBytes(&stride_, sizeof(size_t), 8);
-    } else {
-      compute_encoder->setBytes(out.shape().data(), out_ndim * sizeof(int), 7);
-      compute_encoder->setBytes(
-          out.strides().data(), out_ndim * sizeof(size_t), 8);
-    }
-    compute_encoder->setBytes(&out_ndim, sizeof(size_t), 9);
-    compute_encoder->setBytes(axes_.data(), axes_.size() * sizeof(int), 10);
-
-    // Set index info
-    if (idx_ndim == 0) {
-      // Add a 0 in idx_shapes and strides to avoid the missing buffer binding
-      // error in the metal API.
-      idx_shapes.push_back(0);
-      idx_strides.push_back(0);
-    }
-    compute_encoder->setBytes(
-        idx_shapes.data(), idx_shapes.size() * sizeof(int), 11);
-    compute_encoder->setBytes(
-        idx_strides.data(), idx_strides.size() * sizeof(size_t), 12);
-    compute_encoder->setBytes(&idx_ndim, sizeof(int), 13);
-
-    // Set index buffers
-    for (int i = 0; i < nidx; ++i) {
-      compute_encoder.set_input_array(inputs[i + 1], 20 + i);
-    }
-
-    // Launch grid
-    MTL::Size grid_dims = MTL::Size(upd_size, nthreads / upd_size, 1);
-    MTL::Size group_dims = get_block_dims(upd_size, nthreads / upd_size, 1);
-    compute_encoder.dispatchThreads(grid_dims, group_dims);
+  // Collect all idx shapes and strides into one place
+  Shape idx_shapes;
+  Strides idx_strides;
+  // To access .data() use char instead of bool
+  // bool is 1 byte in Metal so this is safe
+  std::vector<char> idx_contigs;
+  for (int i = 0; i < nidx; ++i) {
+    idx_shapes.insert(
+        idx_shapes.end(),
+        inputs[i + 1].shape().begin(),
+        inputs[i + 1].shape().end());
+    idx_strides.insert(
+        idx_strides.end(),
+        inputs[i + 1].strides().begin(),
+        inputs[i + 1].strides().end());
+    idx_contigs.push_back(inputs[i + 1].flags().row_contiguous);
  }
+
+  if (upd_ndim == 0) {
+    // Need placeholders so Metal doesn't complain
+    int shape_ = 0;
+    int64_t stride_ = 0;
+    compute_encoder.set_bytes(shape_, 3);
+    compute_encoder.set_bytes(stride_, 4);
+  } else {
+    compute_encoder.set_vector_bytes(upd.shape(), 3);
+    compute_encoder.set_vector_bytes(upd.strides(), 4);
+  }
+  compute_encoder.set_bytes(upd_ndim, 5);
+  compute_encoder.set_bytes(upd_size, 6);
+
+  // Set output info
+  size_t out_ndim = out.ndim();
+  if (out_ndim == 0) {
+    // Need placeholders so Metal doesn't complain
+    int shape_ = 0;
+    int64_t stride_ = 0;
+    compute_encoder.set_bytes(shape_, 7);
+    compute_encoder.set_bytes(stride_, 8);
+  } else {
+    compute_encoder.set_vector_bytes(out.shape(), 7);
+    compute_encoder.set_vector_bytes(out.strides(), 8);
+  }
+  compute_encoder.set_bytes(out_ndim, 9);
+  compute_encoder.set_vector_bytes(axes_, 10);
+
+  // Set index info
+  if (idx_ndim == 0) {
+    // Add a 0 in idx_shapes and strides to avoid the missing buffer binding
+    // error in the metal API.
+    idx_shapes.push_back(0);
+    idx_strides.push_back(0);
+    idx_contigs.push_back(false);
+  }
+  compute_encoder.set_vector_bytes(idx_shapes, 11);
+  compute_encoder.set_vector_bytes(idx_strides, 12);
+  compute_encoder.set_vector_bytes(idx_contigs, 13);
+  compute_encoder.set_bytes(idx_ndim, 14);
+  compute_encoder.set_bytes(idx_size, 15);
+
+  // Set index buffers
+  for (int i = 0; i < nidx; ++i) {
+    compute_encoder.set_input_array(inputs[i + 1], 20 + i);
+  }
+
+  // Launch grid
+  auto grid_y = (nthreads / upd_size);
+  grid_y = (grid_y + nwork - 1) / nwork;
+  MTL::Size grid_dims = MTL::Size(upd_size, grid_y, 1);
+  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
+  if (thread_group_size != 1024) {
+    throw std::runtime_error("[Scatter::eval_gpu] Invalid number of threads");
+  }
+  MTL::Size group_dims = get_block_dims(upd_size, grid_y, 1);
+  compute_encoder.dispatch_threads(grid_dims, group_dims);
 }

 } // namespace mlx::core
--- a/mlx/backend/metal/jit/copy.h
+++ b/mlx/backend/metal/jit/copy.h
@@ -1,100 +0,0 @@
-// Copyright © 2024 Apple Inc.
-
-constexpr std::string_view copy_kernels = R"(
-template [[host_name("s_{0}")]] [[kernel]] void copy_s<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    uint index [[thread_position_in_grid]]);
-template [[host_name("v_{0}")]] [[kernel]] void copy_v<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    uint index [[thread_position_in_grid]]);
-
-template [[host_name("g4_{0}")]] [[kernel]] void
-copy_g_nd<{1}, {2}, 4>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int* src_shape [[buffer(2)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    uint3 index [[thread_position_in_grid]],
-    uint3 grid_dim [[threads_per_grid]]);
-template [[host_name("gg4_{0}")]] [[kernel]] void
-copy_gg_nd<{1}, {2}, 4>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int* src_shape [[buffer(2)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    constant const int64_t* dst_strides [[buffer(4)]],
-    uint3 index [[thread_position_in_grid]]);
-template [[host_name("g5_{0}")]] [[kernel]] void
-copy_g_nd<{1}, {2}, 5>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int* src_shape [[buffer(2)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    uint3 index [[thread_position_in_grid]],
-    uint3 grid_dim [[threads_per_grid]]);
-template [[host_name("gg5_{0}")]] [[kernel]] void
-copy_gg_nd<{1}, {2}, 5>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int* src_shape [[buffer(2)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    constant const int64_t* dst_strides [[buffer(4)]],
-    uint3 index [[thread_position_in_grid]]);
-template [[host_name("g1_{0}")]] [[kernel]] void copy_g_nd1<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int64_t& src_stride [[buffer(3)]],
-    uint index [[thread_position_in_grid]]);
-template [[host_name("g2_{0}")]] [[kernel]] void copy_g_nd2<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    uint2 index [[thread_position_in_grid]],
-    uint2 grid_dim [[threads_per_grid]]);
-template [[host_name("g3_{0}")]] [[kernel]] void copy_g_nd3<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    uint3 index [[thread_position_in_grid]],
-    uint3 grid_dim [[threads_per_grid]]);
-template [[host_name("gg1_{0}")]] [[kernel]] void
-copy_gg_nd1<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int64_t& src_stride [[buffer(3)]],
-    constant const int64_t& dst_stride [[buffer(4)]],
-    uint index [[thread_position_in_grid]]);
-template [[host_name("gg2_{0}")]] [[kernel]] void
-copy_gg_nd2<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    constant const int64_t* dst_strides [[buffer(4)]],
-    uint2 index [[thread_position_in_grid]]);
-template [[host_name("gg3_{0}")]] [[kernel]] void
-copy_gg_nd3<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    constant const int64_t* dst_strides [[buffer(4)]],
-    uint3 index [[thread_position_in_grid]]);
-
-template [[host_name("g_{0}")]] [[kernel]] void copy_g<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int* src_shape [[buffer(2)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    constant const int& ndim [[buffer(5)]],
-    uint3 index [[thread_position_in_grid]],
-    uint3 grid_dim [[threads_per_grid]]);
-template [[host_name("gg_{0}")]] [[kernel]] void copy_gg<{1}, {2}>(
-    device const {1}* src [[buffer(0)]],
-    device {2}* dst [[buffer(1)]],
-    constant const int* src_shape [[buffer(2)]],
-    constant const int64_t* src_strides [[buffer(3)]],
-    constant const int64_t* dst_strides [[buffer(4)]],
-    constant const int& ndim [[buffer(5)]],
-    uint3 index [[thread_position_in_grid]]);
-)";
--- a/mlx/backend/metal/jit/gemv_masked.h
+++ b/mlx/backend/metal/jit/gemv_masked.h
@@ -11,13 +11,13 @@ gemv_{trans}masked<{itype}, {outm_t}, {opm_t}, {bm}, {bn}, {sm}, {sn}, {tm}, {tn
    const constant int& marix_ld [[buffer(6)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
-    const constant size_t* vector_batch_stride [[buffer(11)]],
-    const constant size_t* matrix_batch_stride [[buffer(12)]],
+    const constant int64_t* vector_batch_stride [[buffer(11)]],
+    const constant int64_t* matrix_batch_stride [[buffer(12)]],
    const device {outm_t}* out_mask [[buffer(20)]],
    const device {opm_t}* mat_mask [[buffer(21)]],
    const device {opm_t}* vec_mask [[buffer(22)]],
    const constant int* mask_strides [[buffer(23)]],
-    const constant size_t* mask_batch_strides [[buffer(24)]],
+    const constant int64_t* mask_batch_strides [[buffer(24)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
--- a/Show More
+++ b/Show More