Compare commits


3 Commits

Author       SHA1        Message              Date
Awni Hannun  0c1155faf5  binding + tests      2024-12-09 12:57:36 -08:00
Awni Hannun  2b9c24c517  works                2024-12-09 12:57:36 -08:00
Awni Hannun  ee59d50293  try dynamic reshape  2024-12-09 12:57:36 -08:00
294 changed files with 9423 additions and 16884 deletions

View File

@@ -85,7 +85,7 @@ jobs:
name: Install dependencies
command: |
pip install --upgrade cmake
pip install nanobind==2.4.0
pip install nanobind==2.2.0
pip install numpy
sudo apt-get update
sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
@@ -137,7 +137,7 @@ jobs:
source env/bin/activate
pip install --upgrade pip
pip install --upgrade cmake
pip install nanobind==2.4.0
pip install nanobind==2.2.0
pip install numpy
pip install torch
pip install tensorflow
@@ -160,7 +160,6 @@ jobs:
LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py
- run:
name: Build example extension
command: |
@@ -227,7 +226,7 @@ jobs:
source env/bin/activate
pip install --upgrade pip
pip install --upgrade cmake
pip install nanobind==2.4.0
pip install nanobind==2.2.0
pip install --upgrade setuptools
pip install numpy
pip install twine
@@ -292,7 +291,7 @@ jobs:
source env/bin/activate
pip install --upgrade pip
pip install --upgrade cmake
pip install nanobind==2.4.0
pip install nanobind==2.2.0
pip install --upgrade setuptools
pip install numpy
pip install auditwheel

3
.gitignore vendored
View File

@@ -76,9 +76,6 @@ build/
*.out
*.app
# Debug symbols
*.pdb
# VSCode
.vscode/
.DS_Store

View File

@@ -1,16 +1,16 @@
repos:
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.7
rev: v19.1.4
hooks:
- id: clang-format
# Using this mirror lets us use mypyc-compiled black, which is about 2x faster
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 25.1.0
rev: 24.10.0
hooks:
- id: black
- repo: https://github.com/pycqa/isort
rev: 6.0.0
rev: 5.13.2
hooks:
- id: isort
args:

View File

@@ -7,7 +7,7 @@ with a short description of your contribution(s) below. For example:
MLX was developed with contributions from the following individuals:
- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`. Added `cross`. Added `orthogonal` initializer.
- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`. Added `cross`.
- Juarez Bochi: Fixed bug in cross attention.
- Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream`, safetensors support, `einsum`, and `einsum_path`.

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.25)
cmake_minimum_required(VERSION 3.24)
project(mlx LANGUAGES C CXX)
@@ -20,14 +20,12 @@ option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
option(MLX_BUILD_BLAS_FROM_SOURCE "Build OpenBLAS from source code" OFF)
option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
if(NOT MLX_VERSION)
set(MLX_VERSION 0.22.1)
set(MLX_VERSION 0.21.1)
endif()
add_compile_definitions("MLX_VERSION=${MLX_VERSION}")
# --------------------- Processor tests -------------------------
@@ -95,7 +93,8 @@ elseif(MLX_BUILD_METAL)
message(STATUS "Building with macOS SDK version ${MACOS_SDK_VERSION}")
set(METAL_CPP_URL
https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18.zip)
https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18-beta.zip
)
if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
@@ -114,56 +113,16 @@ elseif(MLX_BUILD_METAL)
target_link_libraries(mlx PUBLIC ${METAL_LIB} ${FOUNDATION_LIB} ${QUARTZ_LIB})
endif()
if(WIN32)
if(MSVC)
# GGUF does not build with MSVC.
set(MLX_BUILD_GGUF OFF)
# There is no prebuilt OpenBLAS distribution for MSVC.
set(MLX_BUILD_BLAS_FROM_SOURCE ON)
endif()
# Windows implementation of dlfcn.h APIs.
FetchContent_Declare(
dlfcn-win32
GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
GIT_TAG v1.4.1
EXCLUDE_FROM_ALL)
block()
set(BUILD_SHARED_LIBS OFF)
FetchContent_MakeAvailable(dlfcn-win32)
endblock()
target_include_directories(mlx PRIVATE "${dlfcn-win32_SOURCE_DIR}/src")
target_link_libraries(mlx PRIVATE dl)
endif()
if(MLX_BUILD_CPU)
find_library(ACCELERATE_LIBRARY Accelerate)
if(ACCELERATE_LIBRARY)
message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
set(MLX_BUILD_ACCELERATE ON)
target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
add_compile_definitions(ACCELERATE_NEW_LAPACK)
else()
message(STATUS "Accelerate or arm neon not found, using default backend.")
set(MLX_BUILD_ACCELERATE OFF)
endif()
if(MLX_BUILD_ACCELERATE)
target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
add_compile_definitions(MLX_USE_ACCELERATE)
add_compile_definitions(ACCELERATE_NEW_LAPACK)
elseif(MLX_BUILD_BLAS_FROM_SOURCE)
# Download and build OpenBLAS from source code.
FetchContent_Declare(
openblas
GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git
GIT_TAG v0.3.28
EXCLUDE_FROM_ALL)
set(BUILD_STATIC_LIBS ON) # link statically
set(NOFORTRAN ON) # msvc has no fortran compiler
FetchContent_MakeAvailable(openblas)
target_link_libraries(mlx PRIVATE openblas)
target_include_directories(
mlx PRIVATE "${openblas_SOURCE_DIR}/lapack-netlib/LAPACKE/include"
"${CMAKE_BINARY_DIR}/generated" "${CMAKE_BINARY_DIR}")
else()
if(${CMAKE_HOST_APPLE})
# The blas shipped in macOS SDK is not supported, search homebrew for
# openblas instead.
@@ -181,7 +140,7 @@ if(MLX_BUILD_CPU)
message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
target_link_libraries(mlx PRIVATE ${LAPACK_LIBRARIES})
target_link_libraries(mlx PUBLIC ${LAPACK_LIBRARIES})
# List blas after lapack otherwise we may accidentally include an old
# version of lapack.h from the include dirs of blas.
find_package(BLAS REQUIRED)
@@ -194,7 +153,14 @@ if(MLX_BUILD_CPU)
message(STATUS "Blas lib " ${BLAS_LIBRARIES})
message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
target_link_libraries(mlx PRIVATE ${BLAS_LIBRARIES})
target_link_libraries(mlx PUBLIC ${BLAS_LIBRARIES})
if(WIN32)
find_package(dlfcn-win32 REQUIRED)
message(STATUS "dlfcn-win32 lib " ${dlfcn-win32_LIBRARIES})
message(STATUS "dlfcn-win32 include " ${dlfcn-win32_INCLUDE_DIRS})
target_link_libraries(mlx PUBLIC ${dlfcn-win32_LIBRARIES})
endif()
endif()
else()
set(MLX_BUILD_ACCELERATE OFF)
@@ -241,7 +207,8 @@ if(MLX_BUILD_PYTHON_BINDINGS)
execute_process(
COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE nanobind_ROOT)
OUTPUT_VARIABLE NB_DIR)
list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
find_package(nanobind CONFIG REQUIRED)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/python/src)
endif()

View File

@@ -5,35 +5,35 @@
#include "mlx/mlx.h"
#include "time_utils.h"
namespace mx = mlx::core;
using namespace mlx::core;
void time_value_and_grad() {
auto x = mx::ones({200, 1000});
mx::eval(x);
auto fn = [](mx::array x) {
auto x = ones({200, 1000});
eval(x);
auto fn = [](array x) {
for (int i = 0; i < 20; ++i) {
x = mx::log(mx::exp(x));
x = log(exp(x));
}
return mx::sum(x);
return sum(x);
};
auto grad_fn = mx::grad(fn);
auto grad_fn = grad(fn);
auto independent_value_and_grad = [&]() {
auto value = fn(x);
auto dfdx = grad_fn(x);
return std::vector<mx::array>{value, dfdx};
return std::vector<array>{value, dfdx};
};
TIME(independent_value_and_grad);
auto value_and_grad_fn = mx::value_and_grad(fn);
auto value_and_grad_fn = value_and_grad(fn);
auto combined_value_and_grad = [&]() {
auto [value, dfdx] = value_and_grad_fn(x);
return std::vector<mx::array>{value, dfdx};
return std::vector<array>{value, dfdx};
};
TIME(combined_value_and_grad);
}
int main() {
std::cout << "Benchmarks for " << mx::default_device() << std::endl;
std::cout << "Benchmarks for " << default_device() << std::endl;
time_value_and_grad();
}

View File

@@ -4,21 +4,21 @@
#include "mlx/mlx.h"
#include "time_utils.h"
namespace mx = mlx::core;
using namespace mlx::core;
void time_add_op() {
std::vector<int> sizes(1, 1);
for (int i = 0; i < 9; ++i) {
sizes.push_back(10 * sizes.back());
}
set_default_device(mx::Device::cpu);
set_default_device(Device::cpu);
for (auto size : sizes) {
auto a = mx::random::uniform({size});
auto b = mx::random::uniform({size});
mx::eval(a, b);
auto a = random::uniform({size});
auto b = random::uniform({size});
eval(a, b);
std::cout << "Size " << size << std::endl;
TIMEM("cpu", mx::add, a, b, mx::Device::cpu);
TIMEM("gpu", mx::add, a, b, mx::Device::gpu);
TIMEM("cpu", add, a, b, Device::cpu);
TIMEM("gpu", add, a, b, Device::gpu);
}
}

View File

@@ -6,105 +6,105 @@
#include "mlx/mlx.h"
#include "time_utils.h"
namespace mx = mlx::core;
using namespace mlx::core;
void time_irregular_binary_ops_1D() {
auto device = mx::default_device();
auto device = default_device();
int size = 1000000;
int step = 2;
auto a = mx::random::uniform({size});
auto b = mx::random::uniform({size});
mx::eval(a, b);
auto a = random::uniform({size});
auto b = random::uniform({size});
eval(a, b);
a = slice(a, {0}, {size}, {step});
b = slice(b, {0}, {size}, {step});
TIMEM("1D strided", mx::add, a, b, device);
TIMEM("1D strided", add, a, b, device);
}
void time_irregular_binary_ops_2D() {
auto device = mx::default_device();
auto device = default_device();
int size = 2048;
auto a = mx::random::uniform({size, size});
auto b = mx::random::uniform({size, size});
mx::eval(a, b);
TIMEM("2D regular", mx::add, a, b, device);
auto a = random::uniform({size, size});
auto b = random::uniform({size, size});
eval(a, b);
TIMEM("2D regular", add, a, b, device);
b = mx::transpose(b);
mx::eval(b);
TIMEM("2D mx::transpose", mx::add, a, b, device);
b = transpose(b);
eval(b);
TIMEM("2D transpose", add, a, b, device);
b = mx::random::uniform({size});
mx::eval(b);
TIMEM("2D broadcast dim 0", mx::add, a, b, device);
b = random::uniform({size});
eval(b);
TIMEM("2D broadcast dim 0", add, a, b, device);
b = mx::reshape(b, {size, 1});
mx::eval(b);
TIMEM("2D broadcast dim 1", mx::add, a, b, device);
b = reshape(b, {size, 1});
eval(b);
TIMEM("2D broadcast dim 1", add, a, b, device);
}
void time_irregular_binary_ops_3D() {
auto device = mx::default_device();
auto device = default_device();
int d0 = 32;
int d1 = 512;
int d2 = 512;
auto a = mx::random::uniform({d0, d1, d2});
auto b = mx::random::uniform({d0, d1, d2});
TIMEM("3D regular", mx::add, a, b, device);
auto a = random::uniform({d0, d1, d2});
auto b = random::uniform({d0, d1, d2});
TIMEM("3D regular", add, a, b, device);
b = mx::transpose(b, {0, 2, 1});
TIMEM("3D mx::transpose", mx::add, a, b, device);
b = transpose(b, {0, 2, 1});
TIMEM("3D transpose", add, a, b, device);
b = mx::random::uniform({d1, d2});
TIMEM("3D broadcast dim 0", mx::add, a, b, device);
b = random::uniform({d1, d2});
TIMEM("3D broadcast dim 0", add, a, b, device);
b = mx::random::uniform({d0, 1, d2});
TIMEM("3D broadcast dim 1", mx::add, a, b, device);
b = random::uniform({d0, 1, d2});
TIMEM("3D broadcast dim 1", add, a, b, device);
b = mx::random::uniform({d0, d1, 1});
TIMEM("3D broadcast dim 2", mx::add, a, b, device);
b = random::uniform({d0, d1, 1});
TIMEM("3D broadcast dim 2", add, a, b, device);
b = mx::random::uniform({d2});
TIMEM("3D broadcast dims 0, 1", mx::add, a, b, device);
b = random::uniform({d2});
TIMEM("3D broadcast dims 0, 1", add, a, b, device);
b = mx::random::uniform({d1, 1});
TIMEM("3D broadcast dims 0, 2", mx::add, a, b, device);
b = random::uniform({d1, 1});
TIMEM("3D broadcast dims 0, 2", add, a, b, device);
b = mx::random::uniform({d0, 1, 1});
TIMEM("3D broadcast dims 1, 2", mx::add, a, b, device);
b = random::uniform({d0, 1, 1});
TIMEM("3D broadcast dims 1, 2", add, a, b, device);
}
void time_irregular_binary_ops_4D() {
auto device = mx::default_device();
auto device = default_device();
std::vector<int> shape = {8, 8, 512, 512};
auto a = mx::random::uniform(shape);
auto b = mx::random::uniform(shape);
auto a = random::uniform(shape);
auto b = random::uniform(shape);
TIMEM("4D regular", mx::add, a, b, device);
TIMEM("4D regular", add, a, b, device);
b = mx::transpose(b, {0, 1, 3, 2});
TIMEM("4D mx::transpose", mx::add, a, b, device);
b = transpose(b, {0, 1, 3, 2});
TIMEM("4D transpose", add, a, b, device);
std::string om = "4D broadcast dims ";
for (int i = 0; i < shape.size(); ++i) {
shape[i] = 1;
b = mx::random::uniform(shape);
b = random::uniform(shape);
std::ostringstream msg;
msg << om << i;
TIMEM(msg.str(), mx::add, a, b, device);
TIMEM(msg.str(), add, a, b, device);
for (int j = i + 1; j < shape.size(); ++j) {
shape[j] = 1;
std::ostringstream msg;
msg << om << i << ", " << j;
b = mx::random::uniform(shape);
TIMEM(msg.str(), mx::add, a, b, device);
b = random::uniform(shape);
TIMEM(msg.str(), add, a, b, device);
shape[j] = a.shape(j);
for (int k = j + 1; k < shape.size(); ++k) {
shape[k] = 1;
std::ostringstream msg;
msg << om << i << ", " << j << ", " << k;
b = mx::random::uniform(shape);
TIMEM(msg.str(), mx::add, a, b, device);
b = random::uniform(shape);
TIMEM(msg.str(), add, a, b, device);
shape[k] = a.shape(k);
}
}
@@ -113,83 +113,83 @@ void time_irregular_binary_ops_4D() {
}
void time_irregular_reshape() {
auto device = mx::default_device();
auto device = default_device();
std::vector<int> shape;
auto reshape_fn = [&shape, device](const mx::array& a) {
return mx::reshape(a, shape, device);
auto reshape_fn = [&shape, device](const array& a) {
return reshape(a, shape, device);
};
int size = 64;
int d = 2 * size;
auto a = mx::random::uniform({d, d, d});
auto a = random::uniform({d, d, d});
shape = {8 * size, size, size};
TIMEM("3D contiguous", reshape_fn, a);
a = mx::transpose(a);
a = transpose(a);
shape = {8 * size, size, size};
TIMEM("3D mx::transpose", reshape_fn, a);
TIMEM("3D transpose", reshape_fn, a);
a = mx::transpose(a, {1, 2, 0});
a = transpose(a, {1, 2, 0});
shape = {8 * size, size, size};
TIMEM("3D mx::transpose dims 1 2", reshape_fn, a);
TIMEM("3D transpose dims 1 2", reshape_fn, a);
a = mx::broadcast_to(mx::random::uniform({d, d}), {d, d, d});
a = broadcast_to(random::uniform({d, d}), {d, d, d});
TIMEM("3D broadcast dim 0", reshape_fn, a);
a = mx::broadcast_to(mx::random::uniform({d, 1, d}), {d, d, d});
a = broadcast_to(random::uniform({d, 1, d}), {d, d, d});
TIMEM("3D broadcast dim 1", reshape_fn, a);
a = mx::broadcast_to(mx::random::uniform({d, d, 1}), {d, d, d});
a = broadcast_to(random::uniform({d, d, 1}), {d, d, d});
TIMEM("3D broadcast dim 2", reshape_fn, a);
a = mx::broadcast_to(mx::random::uniform({d}), {d, d, d});
a = broadcast_to(random::uniform({d}), {d, d, d});
TIMEM("3D broadcast dims 0, 1", reshape_fn, a);
a = mx::broadcast_to(mx::random::uniform({d, 1}), {d, d, d});
a = broadcast_to(random::uniform({d, 1}), {d, d, d});
TIMEM("3D broadcast dims 0, 2", reshape_fn, a);
a = mx::broadcast_to(mx::random::uniform({d, 1, 1}), {d, d, d});
a = broadcast_to(random::uniform({d, 1, 1}), {d, d, d});
TIMEM("3D broadcast dims 1, 2", reshape_fn, a);
a = mx::broadcast_to(mx::random::uniform({1, 1, 1}), {d, d, d});
a = broadcast_to(random::uniform({1, 1, 1}), {d, d, d});
TIMEM("3D broadcast dims 1, 2, 3", reshape_fn, a);
}
void time_irregular_astype_1D() {
auto device = mx::default_device();
auto device = default_device();
int size = 1000000;
int step = 2;
auto a = mx::random::uniform({size});
auto a = random::uniform({size});
a = slice(a, {0}, {size}, {step});
TIMEM("1D strided", mx::astype, a, mx::int32, device);
TIMEM("1D strided", astype, a, int32, device);
}
void time_irregular_astype_2D() {
auto device = mx::default_device();
auto device = default_device();
int size = 2048;
std::vector<int> shape = {size, size};
auto a = mx::random::uniform(shape);
TIMEM("2D regular", mx::astype, a, mx::int32, device);
auto a = random::uniform(shape);
TIMEM("2D regular", astype, a, int32, device);
a = mx::transpose(a);
TIMEM("2D mx::transpose", mx::astype, a, mx::int32, device);
a = transpose(a);
TIMEM("2D transpose", astype, a, int32, device);
a = mx::broadcast_to(mx::random::uniform({size}), shape);
TIMEM("2D broadcast dim 0", mx::astype, a, mx::int32, device);
a = broadcast_to(random::uniform({size}), shape);
TIMEM("2D broadcast dim 0", astype, a, int32, device);
a = mx::broadcast_to(mx::random::uniform({size, 1}), shape);
TIMEM("2D broadcast dim 1", mx::astype, a, mx::int32, device);
a = broadcast_to(random::uniform({size, 1}), shape);
TIMEM("2D broadcast dim 1", astype, a, int32, device);
}
int main(int argc, char** argv) {
if (argc > 1) {
bool use_gpu = !strcmp(argv[1], "gpu");
set_default_device(use_gpu ? mx::Device::gpu : mx::Device::cpu);
set_default_device(use_gpu ? Device::gpu : Device::cpu);
}
std::cout << "Benchmarks for " << mx::default_device() << std::endl;
std::cout << "Benchmarks for " << default_device() << std::endl;
time_irregular_binary_ops_1D();
time_irregular_binary_ops_2D();
time_irregular_binary_ops_3D();

View File

@@ -3,20 +3,20 @@
#include "mlx/mlx.h"
#include "time_utils.h"
namespace mx = mlx::core;
using namespace mlx::core;
void time_creation_ops() {
int M = 2000;
int N = 500;
auto shape = {M, N};
auto full_fp32 = [&]() { return mx::full(shape, 3.3f); };
auto full_fp32 = [&]() { return full(shape, 3.3f); };
TIME(full_fp32);
auto zeros_fp32 = [&]() { return mx::zeros(shape, mx::float32); };
auto zeros_fp32 = [&]() { return zeros(shape, float32); };
TIME(zeros_fp32);
auto ones_fp32 = [&]() { return mx::ones(shape, mx::float32); };
auto ones_fp32 = [&]() { return ones(shape, float32); };
TIME(ones_fp32);
auto arange_fp32 = [&]() { return mx::arange(0.0, 10.0, 1e-4); };
auto arange_fp32 = [&]() { return arange(0.0, 10.0, 1e-4); };
TIME(arange_fp32);
}
@@ -24,196 +24,194 @@ void time_type_conversions() {
int M = 2000;
int N = 500;
auto shape = {M, N};
auto device = mx::default_device();
auto device = default_device();
auto a = mx::zeros(shape, mx::float32);
mx::eval(a);
TIMEM("mx::float32 to mx::int32", mx::astype, a, mx::int32, device);
TIMEM("mx::float32 to mx::uint32", mx::astype, a, mx::uint32, device);
auto a = zeros(shape, float32);
eval(a);
TIMEM("float32 to int32", astype, a, int32, device);
TIMEM("float32 to uint32", astype, a, uint32, device);
a = mx::zeros(shape, mx::int32);
mx::eval(a);
TIMEM("mx::int32 to mx::float32", mx::astype, a, mx::float32, device);
a = zeros(shape, int32);
eval(a);
TIMEM("int32 to float32", astype, a, float32, device);
a = mx::zeros(shape, mx::bool_);
mx::eval(a);
TIMEM("bool to mx::float32", mx::astype, a, mx::float32, device);
TIMEM("bool to mx::int32", mx::astype, a, mx::int32, device);
TIMEM("bool to mx::uint32", mx::astype, a, mx::uint32, device);
a = zeros(shape, bool_);
eval(a);
TIMEM("bool to float32", astype, a, float32, device);
TIMEM("bool to int32", astype, a, int32, device);
TIMEM("bool to uint32", astype, a, uint32, device);
}
void time_random_generation() {
int M = 2000;
int N = 500;
auto uniform = [&]() { return mx::random::uniform({M, N}, mx::float32); };
auto uniform = [&]() { return random::uniform({M, N}, float32); };
TIME(uniform);
auto normal = [&]() { return mx::random::normal({M, N}, mx::float32); };
auto normal = [&]() { return random::normal({M, N}, float32); };
TIME(normal);
}
void time_unary_ops() {
int M = 2000;
int N = 500;
auto device = mx::default_device();
auto device = default_device();
auto a = mx::random::normal({M, N});
mx::eval(a);
auto a = random::normal({M, N});
eval(a);
TIME(mlx::core::abs, a, device);
TIME(mx::negative, a, device);
TIME(mx::sign, a, device);
TIME(mx::square, a, device);
TIME(negative, a, device);
TIME(sign, a, device);
TIME(square, a, device);
TIME(mlx::core::sqrt, a, device);
TIME(mx::rsqrt, a, device);
TIME(rsqrt, a, device);
TIME(mlx::core::exp, a, device);
a = mx::random::uniform({M, N});
a = random::uniform({M, N});
TIME(mlx::core::log, a, device);
}
void time_binary_ops() {
int M = 1000, N = 100, K = 10;
auto condition = mx::random::randint(0, 2, {M, N, K});
auto a = mx::random::uniform({M, N, K});
auto b = mx::random::uniform({M, N, K});
auto device = mx::default_device();
mx::eval(a, b);
auto condition = random::randint(0, 2, {M, N, K});
auto a = random::uniform({M, N, K});
auto b = random::uniform({M, N, K});
auto device = default_device();
eval(a, b);
TIME(mx::add, a, b, device);
TIME(mx::subtract, a, b, device);
TIME(mx::multiply, a, b, device);
TIME(mx::divide, a, b, device);
TIME(mx::maximum, a, b, device);
TIME(mx::minimum, a, b, device);
TIME(mx::where, condition, a, b, device);
TIME(add, a, b, device);
TIME(subtract, a, b, device);
TIME(multiply, a, b, device);
TIME(divide, a, b, device);
TIME(maximum, a, b, device);
TIME(minimum, a, b, device);
TIME(where, condition, a, b, device);
condition = mx::array({true});
b = mx::random::uniform({1});
mx::eval(b);
TIMEM("scalar", mx::add, a, b, device);
TIMEM("vector-scalar", mx::subtract, a, b, device);
TIMEM("scalar-vector", mx::subtract, b, a, device);
TIMEM("scalar", mx::multiply, a, b, device);
TIMEM("vector-scalar", mx::divide, a, b, device);
TIMEM("scalar-vector", mx::divide, b, a, device);
TIMEM("scalar-vector", mx::where, condition, a, b, device);
condition = array({true});
b = random::uniform({1});
eval(b);
TIMEM("scalar", add, a, b, device);
TIMEM("vector-scalar", subtract, a, b, device);
TIMEM("scalar-vector", subtract, b, a, device);
TIMEM("scalar", multiply, a, b, device);
TIMEM("vector-scalar", divide, a, b, device);
TIMEM("scalar-vector", divide, b, a, device);
TIMEM("scalar-vector", where, condition, a, b, device);
condition = mx::broadcast_to(mx::array({true}), {1000, 100});
a = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
b = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
mx::eval(a, b);
TIMEM("scalar-scalar broadcast", mx::add, a, b, device);
TIMEM("scalar-scalar broadcast", mx::subtract, a, b, device);
TIMEM("scalar-scalar broadcast", mx::multiply, a, b, device);
TIMEM("scalar-scalar broadcast", mx::divide, a, b, device);
TIMEM("scalar-scalar broadcast", mx::where, condition, a, b, device);
condition = broadcast_to(array({true}), {1000, 100});
a = broadcast_to(random::uniform({1}), {1000, 100});
b = broadcast_to(random::uniform({1}), {1000, 100});
eval(a, b);
TIMEM("scalar-scalar broadcast", add, a, b, device);
TIMEM("scalar-scalar broadcast", subtract, a, b, device);
TIMEM("scalar-scalar broadcast", multiply, a, b, device);
TIMEM("scalar-scalar broadcast", divide, a, b, device);
TIMEM("scalar-scalar broadcast", where, condition, a, b, device);
}
void time_strided_ops() {
int M = 50, N = 50, O = 50, P = 50;
auto a = mx::random::uniform({M, N, O, P});
auto b = mx::random::uniform({M, N, O, P});
auto device = mx::default_device();
mx::eval(a, b);
TIMEM("non-strided", mx::add, a, b, device);
a = mx::transpose(a, {1, 0, 2, 3});
b = mx::transpose(b, {3, 2, 0, 1});
mx::eval(a, b);
TIMEM("strided", mx::add, a, b, device);
auto a = random::uniform({M, N, O, P});
auto b = random::uniform({M, N, O, P});
auto device = default_device();
eval(a, b);
TIMEM("non-strided", add, a, b, device);
a = transpose(a, {1, 0, 2, 3});
b = transpose(b, {3, 2, 0, 1});
eval(a, b);
TIMEM("strided", add, a, b, device);
}
void time_comparisons() {
int M = 1000, N = 100, K = 10;
auto a = mx::random::uniform({M, N, K});
auto b = mx::random::uniform({M, N, K});
auto device = mx::default_device();
mx::eval(a, b);
TIME(mx::equal, a, b, device);
TIME(mx::greater, a, b, device);
TIME(mx::greater_equal, a, b, device);
TIME(mx::less, a, b, device);
TIME(mx::less_equal, a, b, device);
auto a = random::uniform({M, N, K});
auto b = random::uniform({M, N, K});
auto device = default_device();
eval(a, b);
TIME(equal, a, b, device);
TIME(greater, a, b, device);
TIME(greater_equal, a, b, device);
TIME(less, a, b, device);
TIME(less_equal, a, b, device);
}
void time_matvec() {
int M = 2000, N = 200;
auto a = mx::random::uniform({M, N});
auto b = mx::random::uniform({N});
auto c = mx::random::uniform({M});
mx::eval(a, b, c);
auto matvec = [&]() { return mx::matmul(a, b); };
auto a = random::uniform({M, N});
auto b = random::uniform({N});
auto c = random::uniform({M});
eval(a, b, c);
auto matvec = [&]() { return matmul(a, b); };
TIME(matvec);
auto matvec_transpose = [&]() { return mx::matmul(mx::transpose(a), c); };
auto matvec_transpose = [&]() { return matmul(transpose(a), c); };
TIME(matvec_transpose);
}
void time_matmul() {
int M = 1000, N = 1000, K = 1000;
auto a = mx::random::uniform({M, K});
auto b = mx::random::uniform({K, N});
auto device = mx::default_device();
mx::eval(a, b);
TIME(mx::matmul, a, b, device);
auto a = random::uniform({M, K});
auto b = random::uniform({K, N});
auto device = default_device();
eval(a, b);
TIME(matmul, a, b, device);
auto transpose_matmul = [&]() { return mx::matmul(mx::transpose(a), b); };
auto transpose_matmul = [&]() { return matmul(transpose(a), b); };
TIME(transpose_matmul);
}
void time_reductions() {
auto a = mx::random::normal({10000, 1000});
mx::eval(a);
auto sum_all = [&a]() { return mx::sum(a, false); };
auto a = random::normal({10000, 1000});
eval(a);
auto sum_all = [&a]() { return sum(a, false); };
TIME(sum_all);
auto sum_along_0 = [&a]() { return mx::sum(a, 0, false); };
auto sum_along_0 = [&a]() { return sum(a, 0, false); };
TIME(sum_along_0);
auto sum_along_1 = [&a]() { return mx::sum(a, 1, false); };
auto sum_along_1 = [&a]() { return sum(a, 1, false); };
TIME(sum_along_1);
auto prod_all = [&a]() { return mx::prod(a, false); };
auto prod_all = [&a]() { return prod(a, false); };
TIME(prod_all);
auto all_true = [&a]() { return mx::all(a, false); };
auto all_true = [&a]() { return all(a, false); };
TIME(all_true);
auto all_along_0 = [&a]() { return mx::all(a, 0, false); };
auto all_along_0 = [&a]() { return all(a, 0, false); };
TIME(all_along_0);
auto all_along_1 = [&a]() { return mx::all(a, 1, false); };
auto all_along_1 = [&a]() { return all(a, 1, false); };
TIME(all_along_1);
auto any_true = [&a]() { return mx::any(a, false); };
auto any_true = [&a]() { return any(a, false); };
TIME(any_true);
auto argmin_along_0 = [&a]() { return mx::argmin(a, 0, false); };
auto argmin_along_0 = [&a]() { return argmin(a, 0, false); };
TIME(argmin_along_0);
auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
auto argmin_along_1 = [&a]() { return argmin(a, 1, false); };
TIME(argmin_along_1);
}
void time_gather_scatter() {
auto a = mx::random::normal({1000, 768});
mx::eval(a);
auto indices = mx::random::randint(0, 1000, {256});
mx::eval(indices);
auto a = random::normal({1000, 768});
eval(a);
auto indices = random::randint(0, 1000, {256});
eval(indices);
auto embedding_lookup = [&a, &indices]() { return mx::take(a, indices, 0); };
auto embedding_lookup = [&a, &indices]() { return take(a, indices, 0); };
TIME(embedding_lookup);
indices = mx::random::randint(0, 768 * 1000, {256 * 768});
mx::eval(indices);
indices = random::randint(0, 768 * 1000, {256 * 768});
eval(indices);
auto single_element_lookup = [&a, &indices]() {
return mx::take(a, indices);
};
auto single_element_lookup = [&a, &indices]() { return take(a, indices); };
TIME(single_element_lookup);
indices = mx::random::randint(0, 1000, {256});
auto updates = mx::random::normal({256, 1, 768});
mx::eval(indices, updates);
indices = random::randint(0, 1000, {256});
auto updates = random::normal({256, 1, 768});
eval(indices, updates);
auto embedding_update = [&a, &indices, &updates]() {
return scatter(a, indices, updates, 0);
@@ -225,10 +223,10 @@ void time_gather_scatter() {
};
TIME(embedding_add);
a = mx::reshape(a, {-1});
indices = mx::random::randint(0, 768 * 1000, {768 * 256});
updates = mx::random::normal({256 * 768, 1});
mx::eval(a, indices, updates);
a = reshape(a, {-1});
indices = random::randint(0, 768 * 1000, {768 * 256});
updates = random::normal({256 * 768, 1});
eval(a, indices, updates);
auto single_element_update = [&a, &indices, &updates]() {
return scatter(a, indices, updates, 0);
@@ -242,21 +240,21 @@ void time_gather_scatter() {
}
void time_divmod() {
auto a = mx::random::normal({1000});
auto b = mx::random::normal({1000});
mx::eval({a, b});
auto a = random::normal({1000});
auto b = random::normal({1000});
eval({a, b});
auto divmod_fused = [&a, &b]() { return mx::divmod(a, b); };
auto divmod_fused = [&a, &b]() { return divmod(a, b); };
TIME(divmod_fused);
auto divmod_separate = [&a, &b]() {
return std::vector<mx::array>{mx::floor_divide(a, b), mx::remainder(a, b)};
return std::vector<array>{floor_divide(a, b), remainder(a, b)};
};
TIME(divmod_separate);
}
int main() {
std::cout << "Benchmarks for " << mx::default_device() << std::endl;
std::cout << "Benchmarks for " << default_device() << std::endl;
time_creation_ops();
time_type_conversions();
time_unary_ops();

View File

@@ -8,44 +8,30 @@ L = 16384
H = 32
H_k = H // 4
D = 128
V = 128
dtype = mx.float16
loops = 10
def upproject(x, w):
    if w is None:
        return x
    else:
        return x @ w.T
def attention(q, k, v, mask=None, w=None):
def attention(q, k, v):
def _sdpa(q, k, v):
B, Hq, L, D = q.shape
_, Hk, S, _ = k.shape
_, _, _, V = v.shape
q = q.reshape(B, Hk, Hq // Hk, L, D)
k = k[:, :, None, :, :]
v = v[:, :, None, :, :]
s = q @ k.transpose(0, 1, 2, 4, 3)
if mask is not None:
m = mx.broadcast_to(mask, (B, Hq, L, S)).reshape(B, Hk, Hq // Hk, L, S)
s = mx.where(m, s, mx.finfo(s.dtype).min)
p = mx.softmax(s.astype(mx.float32), axis=-1).astype(s.dtype)
o = p @ v
return o.reshape(B, Hq, L, V)
return o.reshape(B, Hq, L, D)
for i in range(loops):
q = _sdpa(q, k, v)
q = upproject(q, w)
return q
def sdpa(q, k, v, mask=None, w=None):
def sdpa(q, k, v):
for i in range(loops):
q = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0, mask=mask)
q = upproject(q, w)
q = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0)
return q
@@ -53,43 +39,20 @@ def time_self_attention_primitives():
mx.random.seed(3)
q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
v = mx.random.uniform(shape=(1, H_k, L, V)).astype(dtype)
w = mx.random.uniform(shape=(D, V)).astype(dtype) if V != D else None
mx.eval(q, k, v, w)
time_fn(attention, q, k, v, w=w)
v = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
mx.eval(q, k, v)
time_fn(attention, q, k, v)
def time_self_attention_sdpa():
mx.random.seed(3)
q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
v = mx.random.uniform(shape=(1, H_k, L, V)).astype(dtype)
w = mx.random.uniform(shape=(D, V)).astype(dtype) if V != D else None
mx.eval(q, k, v, w)
time_fn(sdpa, q, k, v, w=w)
def time_self_attention_sdpa_with_mask():
mx.random.seed(3)
q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
v = mx.random.uniform(shape=(1, H_k, L, V)).astype(dtype)
w = mx.random.uniform(shape=(D, V)).astype(dtype) if V != D else None
mask = mx.full((L,), True)
mask[L // 2 :] = False
mx.eval(q, k, v, mask, w)
def sdpa_mask(*args):
return sdpa(*args, mask=mask, w=w)
def attention_mask(*args):
return attention(*args, mask=mask, w=w)
time_fn(attention_mask, q, k, v)
time_fn(sdpa_mask, q, k, v)
v = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
mx.eval(q, k, v)
time_fn(sdpa, q, k, v)
if __name__ == "__main__":
time_self_attention_sdpa()
time_self_attention_primitives()
time_self_attention_sdpa_with_mask()
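Since the old and new versions are interleaved in the hunk above, here is a small consolidated sketch (not part of the diff) of the masked `mx.fast.scaled_dot_product_attention` call this benchmark exercises, with shapes shrunk for illustration:

```python
import mlx.core as mx

B, Hq, Hk, L, D = 1, 8, 2, 64, 32
q = mx.random.uniform(shape=(B, Hq, 1, D)).astype(mx.float16)
k = mx.random.uniform(shape=(B, Hk, L, D)).astype(mx.float16)
v = mx.random.uniform(shape=(B, Hk, L, D)).astype(mx.float16)

# Boolean mask over key positions: True = attend, False = ignore
mask = mx.full((L,), True)
mask[L // 2 :] = False

out = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0, mask=mask)
mx.eval(out)
print(out.shape)  # (1, 8, 1, 32)
```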

View File

@@ -1,55 +0,0 @@
import time
import mlx.core as mx
rank = mx.distributed.init().rank()
def timeit(fn, a):
    # warmup
    for _ in range(5):
        mx.eval(fn(a))
    its = 10
    tic = time.perf_counter()
    for _ in range(its):
        mx.eval(fn(a))
    toc = time.perf_counter()
    ms = 1000 * (toc - tic) / its
    return ms
def all_reduce_benchmark():
    a = mx.ones((5, 5), mx.int32)
    its_per_eval = 100
    def fn(x):
        for _ in range(its_per_eval):
            x = mx.distributed.all_sum(x)
            x = x - 1
        return x
    ms = timeit(fn, a) / its_per_eval
    if rank == 0:
        print(f"All Reduce: time per iteration {ms:.6f} (ms)")
def all_gather_benchmark():
    a = mx.ones((5, 5), mx.int32)
    its_per_eval = 100
    def fn(x):
        for _ in range(its_per_eval):
            x = mx.distributed.all_gather(x)[0]
        return x
    ms = timeit(fn, a) / its_per_eval
    if rank == 0:
        print(f"All gather: time per iteration {ms:.6f} (ms)")
if __name__ == "__main__":
    all_reduce_benchmark()
    all_gather_benchmark()

View File

@@ -1,121 +0,0 @@
.. _mlx_in_cpp:
Using MLX in C++
================
You can use MLX in a C++ project with CMake.
.. note::
This guide is based on the following `example using MLX in C++
<https://github.com/ml-explore/mlx/tree/main/examples/cmake_project>`_
First install MLX:
.. code-block:: bash
pip install -U mlx
You can also install the MLX Python package from source or just the C++
library. For more information see the :ref:`documentation on installing MLX
<build_and_install>`.
Next make an example program in ``example.cpp``:
.. code-block:: C++
#include <iostream>
#include "mlx/mlx.h"
namespace mx = mlx::core;
int main() {
auto x = mx::array({1, 2, 3});
auto y = mx::array({1, 2, 3});
std::cout << x + y << std::endl;
return 0;
}
The next step is to set up a CMake file in ``CMakeLists.txt``:
.. code-block:: cmake
cmake_minimum_required(VERSION 3.27)
project(example LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
Depending on how you installed MLX, you may need to tell CMake where to
find it.
If you installed MLX with Python, then add the following to the CMake file:
.. code-block:: cmake
find_package(
Python 3.9
COMPONENTS Interpreter Development.Module
REQUIRED)
execute_process(
COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE MLX_ROOT)
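If it is unclear what that ``execute_process`` step resolves to, a quick check from
Python (a sketch added here for illustration; it just shells out to the same
``-m mlx --cmake-dir`` entry point used above):

.. code-block:: python

    import subprocess
    import sys

    # Print the directory containing the MLX CMake package for the installed wheel
    root = subprocess.run(
        [sys.executable, "-m", "mlx", "--cmake-dir"],
        capture_output=True,
        text=True,
        check=True,
    ).stdout.strip()
    print(root)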
If you installed the MLX C++ package to a system path, then CMake should be
able to find it. If you installed it to a non-standard location or CMake can't
find MLX then set ``MLX_ROOT`` to the location where MLX is installed:
.. code-block:: cmake
set(MLX_ROOT "/path/to/mlx/")
Next, instruct CMake to find MLX:
.. code-block:: cmake
find_package(MLX CONFIG REQUIRED)
Finally, add the ``example.cpp`` program as an executable and link MLX.
.. code-block:: cmake
add_executable(example example.cpp)
target_link_libraries(example PRIVATE mlx)
You can build the example with:
.. code-block:: bash
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build
And run it with:
.. code-block:: bash
./build/example
Note ``find_package(MLX CONFIG REQUIRED)`` sets the following variables:
.. list-table:: Package Variables
:widths: 20 20
:header-rows: 1
* - Variable
- Description
* - MLX_FOUND
- ``True`` if MLX is found
* - MLX_INCLUDE_DIRS
- Include directory
* - MLX_LIBRARIES
- Libraries to link against
* - MLX_CXX_FLAGS
- Additional compiler flags
* - MLX_BUILD_ACCELERATE
- ``True`` if MLX was built with Accelerate
* - MLX_BUILD_METAL
- ``True`` if MLX was built with Metal

View File

@@ -45,7 +45,6 @@ are the CPU and GPU.
usage/numpy
usage/distributed
usage/using_streams
usage/export
.. toctree::
:caption: Examples
@@ -62,7 +61,6 @@ are the CPU and GPU.
python/array
python/data_types
python/devices_and_streams
python/export
python/ops
python/random
python/transforms
@@ -88,4 +86,3 @@ are the CPU and GPU.
dev/extensions
dev/metal_debugger
dev/custom_metal_kernels
dev/mlx_in_cpp

View File

@@ -1,5 +1,3 @@
.. _build_and_install:
Build and Install
=================
@@ -55,7 +53,7 @@ Build Requirements
^^^^^^^^^^^^^^^^^^
- A C++ compiler with C++17 support (e.g. Clang >= 5.0)
- `cmake <https://cmake.org/>`_ -- version 3.25 or later, and ``make``
- `cmake <https://cmake.org/>`_ -- version 3.24 or later, and ``make``
- Xcode >= 15.0 and macOS SDK >= 14.0
.. note::

View File

@@ -66,4 +66,3 @@ documentation for more information. Use :func:`issubdtype` to determine if one
Dtype
DtypeCategory
issubdtype
finfo
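As a brief illustrative sketch of the two helpers listed above (added here for
clarity; it assumes ``mx.floating`` is the floating-point ``DtypeCategory``):

.. code-block:: python

    import mlx.core as mx

    # Dtype category checks
    assert mx.issubdtype(mx.float16, mx.floating)
    assert not mx.issubdtype(mx.int32, mx.floating)

    # Floating point limits, e.g. the most negative finite float16
    print(mx.finfo(mx.float16).min)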

View File

@@ -1,14 +0,0 @@
.. _export:
Export Functions
================
.. currentmodule:: mlx.core
.. autosummary::
:toctree: _autosummary
export_function
import_function
exporter
export_to_dot

View File

@@ -89,7 +89,6 @@ Operations
isneginf
isposinf
issubdtype
kron
left_shift
less
less_equal
@@ -145,8 +144,6 @@ Operations
sign
sin
sinh
slice
slice_update
softmax
sort
split
@@ -171,7 +168,6 @@ Operations
tri
tril
triu
unflatten
var
view
where

View File

@@ -421,77 +421,3 @@ the most opportunity to optimize the computation graph:
# Compiling the outer function is good to do as it will likely
# be faster even though the inner functions are compiled
fun = mx.compile(outer)
.. _shapeless_compile:
Shapeless Compilation
---------------------
When the shape of an input to a compiled function changes, the function is
recompiled. You can compile a function once and run it on inputs with
variable shapes by specifying ``shapeless=True`` to :func:`compile`. In this
case changes to the shapes of the inputs do not cause the function to be
recompiled.
.. code-block:: python
def fun(x, y):
    return mx.abs(x + y)
compiled_fun = mx.compile(fun, shapeless=True)
x = mx.array(1.0)
y = mx.array(-2.0)
# First call compiles the function
print(compiled_fun(x, y))
# Second call with different shapes
# does not recompile the function
x = mx.array([1.0, -6.0])
y = mx.array([-2.0, 3.0])
print(compiled_fun(x, y))
Use shapeless compilations carefully. Since compilation is not triggered when
shapes change, any graphs which are conditional on the input shapes will not
work as expected. Shape-dependent computations are common and sometimes subtle
to detect. For example:
.. code-block:: python
def fun(x):
    return x.reshape(x.shape[0] * x.shape[1], -1)
compiled_fun = mx.compile(fun, shapeless=True)
x = mx.random.uniform(shape=(2, 3, 4))
out = compiled_fun(x)
x = mx.random.uniform(shape=(5, 5, 3))
# Error, can't reshape (5, 5, 3) to (6, -1)
out = compiled_fun(x)
The second call to the ``compiled_fun`` fails because of the call to
:func:`reshape` which uses the static shape of ``x`` in the first call. We can
fix this by using :func:`flatten` to avoid hardcoding the shape of ``x``:
.. code-block:: python
def fun(x):
    return x.flatten(0, 1)
compiled_fun = mx.compile(fun, shapeless=True)
x = mx.random.uniform(shape=(2, 3, 4))
out = compiled_fun(x)
x = mx.random.uniform(shape=(5, 5, 3))
# Ok
out = compiled_fun(x)
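A related pitfall, sketched here for illustration (not from the original guide):
Python-level shape arithmetic is captured as a constant when the function is first
traced, so with ``shapeless=True`` it can silently produce wrong values rather than
raise an error:

.. code-block:: python

    def fun(x):
        # x.shape[0] is a plain Python int, baked in at trace time
        return mx.sum(x, axis=0) / x.shape[0]

    compiled_fun = mx.compile(fun, shapeless=True)

    x = mx.random.uniform(shape=(4, 8))
    out = compiled_fun(x)  # divides by 4, as expected

    x = mx.random.uniform(shape=(16, 8))
    # No error, but the division still uses 4; prefer mx.mean(x, axis=0)
    out = compiled_fun(x)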

View File

@@ -57,7 +57,7 @@ with the Anaconda package manager as follows:
.. code:: shell
$ conda install conda-forge::openmpi
$ conda install openmpi
Installing with Homebrew may require specifying the location of ``libmpi.dylib``
so that MLX can find it and load it at runtime. This can simply be achieved by
@@ -141,13 +141,12 @@ everything else remaining the same.
from mlx.utils import tree_map
def all_reduce_grads(grads):
N = mx.distributed.init().size()
N = mx.distributed.init()
if N == 1:
return grads
return tree_map(
lambda x: mx.distributed.all_sum(x) / N,
grads
)
lambda x: mx.distributed.all_sum(x) / N,
grads)
def step(model, x, y):
loss, grads = loss_grad_fn(model, x, y)
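For clarity, here is the averaging helper together with a training step sketched
end to end (``loss_grad_fn``, ``model``, and ``optimizer`` are assumed to be defined
as elsewhere in this guide; only the gradient averaging is shown in full):

.. code-block:: python

    import mlx.core as mx
    from mlx.utils import tree_map

    def all_reduce_grads(grads):
        # Average gradients across processes; a no-op for single-process runs
        N = mx.distributed.init().size()
        if N == 1:
            return grads
        return tree_map(lambda g: mx.distributed.all_sum(g) / N, grads)

    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
        grads = all_reduce_grads(grads)
        optimizer.update(model, grads)
        return loss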

View File

@@ -1,288 +0,0 @@
.. _export_usage:
Exporting Functions
===================
.. currentmodule:: mlx.core
MLX has an API to export and import functions to and from a file. This lets you
run computations written in one MLX front-end (e.g. Python) in another MLX
front-end (e.g. C++).
This guide walks through the basics of the MLX export API with some examples.
To see the full list of functions, check out the :ref:`API documentation
<export>`.
Basics of Exporting
-------------------
Let's start with a simple example:
.. code-block:: python
def fun(x, y):
    return x + y
x = mx.array(1.0)
y = mx.array(1.0)
mx.export_function("add.mlxfn", fun, x, y)
To export a function, provide sample input arrays that the function
can be called with. The data doesn't matter, but the shapes and types of the
arrays do. In the above example we exported ``fun`` with two ``float32``
scalar arrays. We can then import the function and run it:
.. code-block:: python
add_fun = mx.import_function("add.mlxfn")
out, = add_fun(mx.array(1.0), mx.array(2.0))
# Prints: array(3, dtype=float32)
print(out)
out, = add_fun(mx.array(1.0), mx.array(3.0))
# Prints: array(4, dtype=float32)
print(out)
# Raises an exception
add_fun(mx.array(1), mx.array(3.0))
# Raises an exception
add_fun(mx.array([1.0, 2.0]), mx.array(3.0))
Notice the third and fourth calls to ``add_fun`` raise exceptions because the
shapes and types of the inputs are different than the shapes and types of the
example inputs we exported the function with.
Also notice that even though the original ``fun`` returns a single output
array, the imported function always returns a tuple of one or more arrays.
The inputs to :func:`export_function` and to an imported function can be
specified as variable positional arguments or as a tuple of arrays:
.. code-block:: python
def fun(x, y):
    return x + y
x = mx.array(1.0)
y = mx.array(1.0)
# Both arguments to fun are positional
mx.export_function("add.mlxfn", fun, x, y)
# Same as above
mx.export_function("add.mlxfn", fun, (x, y))
imported_fun = mx.import_function("add.mlxfn")
# Ok
out, = imported_fun(x, y)
# Also ok
out, = imported_fun((x, y))
You can pass example inputs to functions as positional or keyword arguments. If
you use keyword arguments to export the function, then you have to use the same
keyword arguments when calling the imported function.
.. code-block:: python
def fun(x, y):
    return x + y
# One argument to fun is positional, the other is a kwarg
mx.export_function("add.mlxfn", fun, x, y=y)
imported_fun = mx.import_function("add.mlxfn")
# Ok
out, = imported_fun(x, y=y)
# Also ok
out, = imported_fun((x,), {"y": y})
# Raises since the keyword argument is missing
out, = imported_fun(x, y)
# Raises since the keyword argument has the wrong key
out, = imported_fun(x, z=y)
Exporting Modules
-----------------
An :obj:`mlx.nn.Module` can be exported with or without the parameters included
in the exported function. Here's an example:
.. code-block:: python
model = nn.Linear(4, 4)
mx.eval(model.parameters())
def call(x):
    return model(x)
mx.export_function("model.mlxfn", call, mx.zeros(4))
In the above example, the :obj:`mlx.nn.Linear` module is exported. Its
parameters are also saved to the ``model.mlxfn`` file.
.. note::
For enclosed arrays inside an exported function, be extra careful to ensure
they are evaluated. The computation graph that gets exported will include
the computation that produces enclosed inputs.
If the above example was missing ``mx.eval(model.parameters())``, the
exported function would include the random initialization of the
:obj:`mlx.nn.Module` parameters.
If you only want to export the ``Module.__call__`` function without the
parameters, pass them as inputs to the ``call`` wrapper:
.. code-block:: python
model = nn.Linear(4, 4)
mx.eval(model.parameters())
def call(x, **params):
    # Set the model's parameters to the input parameters
    model.update(tree_unflatten(list(params.items())))
    return model(x)
params = dict(tree_flatten(model.parameters()))
mx.export_function("model.mlxfn", call, (mx.zeros(4),), params)
Shapeless Exports
-----------------
Just like :func:`compile`, functions can also be exported for dynamically shaped
inputs. Pass ``shapeless=True`` to :func:`export_function` or :func:`exporter`
to export a function which can be used for inputs with variable shapes:
.. code-block:: python
mx.export_function("fun.mlxfn", mx.abs, mx.array(0.0), shapeless=True)
imported_abs = mx.import_function("fun.mlxfn")
# Ok
out, = imported_abs(mx.array(-1.0))
# Also ok
out, = imported_abs(mx.array([-1.0, -2.0]))
With ``shapeless=False`` (which is the default), the second call to
``imported_abs`` would raise an exception with a shape mismatch.
Shapeless exporting works the same as shapeless compilation and should be
used carefully. See the :ref:`documentation on shapeless compilation
<shapeless_compile>` for more information.
Exporting Multiple Traces
-------------------------
In some cases, functions build different computation graphs for different
input arguments. A simple way to manage this is to export to a new file with
each set of inputs. This is a fine option in many cases. But it can be
suboptimal if the exported functions have a large amount of duplicate constant
data (for example the parameters of a :obj:`mlx.nn.Module`).
The export API in MLX lets you export multiple traces of the same function to
a single file by creating an exporting context manager with :func:`exporter`:
.. code-block:: python
def fun(x, y=None):
    constant = mx.array(3.0)
    if y is not None:
        x += y
    return x + constant
with mx.exporter("fun.mlxfn", fun) as exporter:
    exporter(mx.array(1.0))
    exporter(mx.array(1.0), y=mx.array(0.0))
imported_function = mx.import_function("fun.mlxfn")
# Call the function with y=None
out, = imported_function(mx.array(1.0))
print(out)
# Call the function with y specified
out, = imported_function(mx.array(1.0), y=mx.array(1.0))
print(out)
In the above example the function's constant data (i.e. ``constant``) is only
saved once.
Transformations with Imported Functions
---------------------------------------
Function transformations like :func:`grad`, :func:`vmap`, and :func:`compile` work
on imported functions just like regular Python functions:
.. code-block:: python
def fun(x):
    return mx.sin(x)
x = mx.array(0.0)
mx.export_function("sine.mlxfn", fun, x)
imported_fun = mx.import_function("sine.mlxfn")
# Take the derivative of the imported function
dfdx = mx.grad(lambda x: imported_fun(x)[0])
# Prints: array(1, dtype=float32)
print(dfdx(x))
# Compile the imported function
compiled_fun = mx.compile(imported_fun)
# Prints: array(0, dtype=float32)
print(compiled_fun(x)[0])
Importing Functions in C++
--------------------------
Importing and running functions in C++ is basically the same as importing and
running them in Python. First, follow the :ref:`instructions <mlx_in_cpp>` to
set up a simple C++ project that uses MLX as a library.
Next, export a simple function from Python:
.. code-block:: python
def fun(x, y):
    return mx.exp(x + y)
x = mx.array(1.0)
y = mx.array(1.0)
mx.export_function("fun.mlxfn", fun, x, y)
Import and run the function in C++ with only a few lines of code:
.. code-block:: c++
auto fun = mx::import_function("fun.mlxfn");
auto inputs = {mx::array(1.0), mx::array(1.0)};
auto outputs = fun(inputs);
// Prints: array(2, dtype=float32)
std::cout << outputs[0] << std::endl;
Imported functions can be transformed in C++ just like in Python. Use
``std::vector<mx::array>`` for positional arguments and ``std::map<std::string,
mx::array>`` for keyword arguments when calling imported functions in C++.
More Examples
-------------
Here are a few more complete examples exporting more complex functions from
Python and importing and running them in C++:
* `Inference and training a multi-layer perceptron <https://github.com/ml-explore/mlx/tree/main/examples/export>`_

View File

@@ -1,22 +0,0 @@
cmake_minimum_required(VERSION 3.27)
project(example LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Comment out the following two commands if only the MLX C++ library is installed and
# set(MLX_ROOT "/path/to/mlx") directly if needed.
find_package(
Python 3.9
COMPONENTS Interpreter Development.Module
REQUIRED)
execute_process(
COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE MLX_ROOT)
find_package(MLX CONFIG REQUIRED)
add_executable(example example.cpp)
target_link_libraries(example PRIVATE mlx)

View File

@@ -1,26 +0,0 @@
## Build and Run
Install MLX with Python:
```bash
pip install "mlx>=0.22"
```
Build the C++ example:
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
Run the C++ example:
```
./build/example
```
which should output:
```
array([2, 4, 6], dtype=int32)
```

View File

@@ -1,14 +0,0 @@
// Copyright © 2024 Apple Inc.
#include <iostream>
#include "mlx/mlx.h"
namespace mx = mlx::core;
int main() {
auto x = mx::array({1, 2, 3});
auto y = mx::array({1, 2, 3});
std::cout << x + y << std::endl;
return 0;
}

View File

@@ -4,19 +4,19 @@
#include "mlx/mlx.h"
namespace mx = mlx::core;
using namespace mlx::core;
int main() {
if (!mx::distributed::is_available()) {
if (!distributed::is_available()) {
std::cout << "No communication backend found" << std::endl;
return 1;
}
auto global_group = mx::distributed::init();
auto global_group = distributed::init();
std::cout << global_group.rank() << " / " << global_group.size() << std::endl;
mx::array x = mx::ones({10});
mx::array out = mx::distributed::all_sum(x, global_group);
array x = ones({10});
array out = distributed::all_sum(x, global_group);
std::cout << out << std::endl;
}

View File

@@ -10,7 +10,7 @@
/**
* An example of linear regression with MLX.
*/
namespace mx = mlx::core;
using namespace mlx::core;
int main() {
int num_features = 100;
@@ -19,35 +19,35 @@ int main() {
float learning_rate = 0.01;
// True parameters
auto w_star = mx::random::normal({num_features});
auto w_star = random::normal({num_features});
// The input examples (design matrix)
auto X = mx::random::normal({num_examples, num_features});
auto X = random::normal({num_examples, num_features});
// Noisy labels
auto eps = 1e-2 * mx::random::normal({num_examples});
auto y = mx::matmul(X, w_star) + eps;
auto eps = 1e-2 * random::normal({num_examples});
auto y = matmul(X, w_star) + eps;
// Initialize random parameters
mx::array w = 1e-2 * mx::random::normal({num_features});
array w = 1e-2 * random::normal({num_features});
auto loss_fn = [&](mx::array w) {
auto yhat = mx::matmul(X, w);
return (0.5f / num_examples) * mx::sum(mx::square(yhat - y));
auto loss_fn = [&](array w) {
auto yhat = matmul(X, w);
return (0.5f / num_examples) * sum(square(yhat - y));
};
auto grad_fn = mx::grad(loss_fn);
auto grad_fn = grad(loss_fn);
auto tic = timer::time();
for (int it = 0; it < num_iters; ++it) {
auto grads = grad_fn(w);
w = w - learning_rate * grads;
mx::eval(w);
auto grad = grad_fn(w);
w = w - learning_rate * grad;
eval(w);
}
auto toc = timer::time();
auto loss = loss_fn(w);
auto error_norm = std::sqrt(mx::sum(mx::square(w - w_star)).item<float>());
auto error_norm = std::sqrt(sum(square(w - w_star)).item<float>());
auto throughput = num_iters / timer::seconds(toc - tic);
std::cout << "Loss " << loss << ", |w - w*| = " << error_norm
<< ", Throughput " << throughput << " (it/s)." << std::endl;

View File

@@ -10,7 +10,7 @@
/**
* An example of logistic regression with MLX.
*/
namespace mx = mlx::core;
using namespace mlx::core;
int main() {
int num_features = 100;
@@ -19,35 +19,35 @@ int main() {
float learning_rate = 0.1;
// True parameters
auto w_star = mx::random::normal({num_features});
auto w_star = random::normal({num_features});
// The input examples
auto X = mx::random::normal({num_examples, num_features});
auto X = random::normal({num_examples, num_features});
// Labels
auto y = mx::matmul(X, w_star) > 0;
auto y = matmul(X, w_star) > 0;
// Initialize random parameters
mx::array w = 1e-2 * mx::random::normal({num_features});
array w = 1e-2 * random::normal({num_features});
auto loss_fn = [&](mx::array w) {
auto logits = mx::matmul(X, w);
auto loss_fn = [&](array w) {
auto logits = matmul(X, w);
auto scale = (1.0f / num_examples);
return scale * mx::sum(mx::logaddexp(mx::array(0.0f), logits) - y * logits);
return scale * sum(logaddexp(array(0.0f), logits) - y * logits);
};
auto grad_fn = mx::grad(loss_fn);
auto grad_fn = grad(loss_fn);
auto tic = timer::time();
for (int it = 0; it < num_iters; ++it) {
auto grads = grad_fn(w);
w = w - learning_rate * grads;
mx::eval(w);
auto grad = grad_fn(w);
w = w - learning_rate * grad;
eval(w);
}
auto toc = timer::time();
auto loss = loss_fn(w);
auto acc = mx::sum((mx::matmul(X, w) > 0) == y) / num_examples;
auto acc = sum((matmul(X, w) > 0) == y) / num_examples;
auto throughput = num_iters / timer::seconds(toc - tic);
std::cout << "Loss " << loss << ", Accuracy, " << acc << ", Throughput "
<< throughput << " (it/s)." << std::endl;

View File

@@ -5,27 +5,27 @@
#include "mlx/mlx.h"
namespace mx = mlx::core;
using namespace mlx::core;
int main() {
// To use Metal debugging and profiling:
// 1. Build with the MLX_METAL_DEBUG CMake option (i.e. -DMLX_METAL_DEBUG=ON).
// 2. Run with MTL_CAPTURE_ENABLED=1.
mx::metal::start_capture("mlx_trace.gputrace");
metal::start_capture("mlx_trace.gputrace");
// Start at index two because the default GPU and CPU streams have indices
// zero and one, respectively. This naming matches the label assigned to each
// stream's command queue.
auto s2 = new_stream(mx::Device::gpu);
auto s3 = new_stream(mx::Device::gpu);
auto s2 = new_stream(Device::gpu);
auto s3 = new_stream(Device::gpu);
auto a = mx::arange(1.f, 10.f, 1.f, mx::float32, s2);
auto b = mx::arange(1.f, 10.f, 1.f, mx::float32, s3);
auto x = mx::add(a, a, s2);
auto y = mx::add(b, b, s3);
auto a = arange(1.f, 10.f, 1.f, float32, s2);
auto b = arange(1.f, 10.f, 1.f, float32, s3);
auto x = add(a, a, s2);
auto y = add(b, b, s3);
// The multiply will happen on the default stream.
std::cout << mx::multiply(x, y) << std::endl;
std::cout << multiply(x, y) << std::endl;
mx::metal::stop_capture();
metal::stop_capture();
}

View File

@@ -5,11 +5,11 @@
#include "mlx/mlx.h"
namespace mx = mlx::core;
using namespace mlx::core;
void array_basics() {
// Make a scalar array:
mx::array x(1.0);
array x(1.0);
// Get the value out of it:
auto s = x.item<float>();
@@ -29,31 +29,31 @@ void array_basics() {
// The datatype should be float32:
auto dtype = x.dtype();
assert(dtype == mx::float32);
assert(dtype == float32);
// Specify the dtype when constructing the array:
x = mx::array(1, mx::int32);
assert(x.dtype() == mx::int32);
x = array(1, int32);
assert(x.dtype() == int32);
x.item<int>(); // OK
// x.item<float>(); // Undefined!
// Make a multidimensional array:
x = mx::array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
x = array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
// mlx is row-major by default so the first row of this array
// is [1.0, 2.0] and the second row is [3.0, 4.0]
// Make an array of shape {2, 2} filled with ones:
auto y = mx::ones({2, 2});
auto y = ones({2, 2});
// Pointwise add x and y:
auto z = mx::add(x, y);
auto z = add(x, y);
// Same thing:
z = x + y;
// mlx is lazy by default. At this point `z` only
// has a shape and a type but no actual data:
assert(z.dtype() == mx::float32);
assert(z.dtype() == float32);
assert(z.shape(0) == 2);
assert(z.shape(1) == 2);
@@ -63,33 +63,33 @@ void array_basics() {
// and inputs. When `eval` is called on an array (or arrays), the array and
// all of its dependencies are recursively evaluated to produce the result.
// Once an array is evaluated, it has data and is detached from its inputs.
mx::eval(z);
eval(z);
// Of course the array can still be an input to other operations. You can
// even call eval on the array again, this will just be a no-op:
mx::eval(z); // no-op
// Of course the array can still be an input to other operations. You can even
// call eval on the array again, this will just be a no-op:
eval(z); // no-op
// Some functions or methods on arrays implicitly evaluate them. For example
// accessing a value in an array or printing the array implicitly evaluate it:
z = mx::ones({1});
z = ones({1});
z.item<float>(); // implicit evaluation
z = mx::ones({2, 2});
z = ones({2, 2});
std::cout << z << std::endl; // implicit evaluation
}
void automatic_differentiation() {
auto fn = [](mx::array x) { return mx::square(x); };
auto fn = [](array x) { return square(x); };
// Computing the derivative function of a function
auto grad_fn = mx::grad(fn);
auto grad_fn = grad(fn);
// Call grad_fn on the input to get the derivative
auto x = mx::array(1.5);
auto x = array(1.5);
auto dfdx = grad_fn(x);
// dfdx is 2 * x
// Get the second derivative by composing grad with grad
auto d2fdx2 = mx::grad(mx::grad(fn))(x);
auto d2fdx2 = grad(grad(fn))(x);
// d2fdx2 is 2
}
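For comparison (a sketch added here, not part of the C++ tutorial), the same
automatic differentiation flow in the Python front-end looks like this:

```python
import mlx.core as mx

def fn(x):
    return mx.square(x)

# First derivative: d/dx x^2 = 2x
grad_fn = mx.grad(fn)
x = mx.array(1.5)
print(grad_fn(x))  # array(3, dtype=float32)

# Second derivative by composing grad with grad
print(mx.grad(mx.grad(fn))(x))  # array(2, dtype=float32)
```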

View File

@@ -1,22 +0,0 @@
cmake_minimum_required(VERSION 3.27)
project(import_mlx LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
find_package(
Python 3.9
COMPONENTS Interpreter Development.Module
REQUIRED)
execute_process(
COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE MLX_ROOT)
find_package(MLX CONFIG REQUIRED)
add_executable(eval_mlp eval_mlp.cpp)
target_link_libraries(eval_mlp PRIVATE mlx)
add_executable(train_mlp train_mlp.cpp)
target_link_libraries(train_mlp PRIVATE mlx)

View File

@@ -1,49 +0,0 @@
## Setup
Install MLX:
```bash
pip install "mlx>=0.22"
```
Build the C++ examples:
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
## Run
### Eval MLP
Run the Python script to export the eval function:
```bash
python eval_mlp.py
```
Then run the C++ program to import and run the function:
```bash
./build/eval_mlp
```
The Python and C++ programs should output the same result.
### Train MLP
Run the Python script to export the model initialization and training
functions:
```bash
python train_mlp.py
```
Then run the C++ program to import and run the functions:
```bash
./build/train_mlp
```
The Python and C++ programs should output the same results.

View File

@@ -1,25 +0,0 @@
// Copyright © 2024 Apple Inc.
#include <mlx/mlx.h>
#include <iostream>
namespace mx = mlx::core;
int main() {
int batch_size = 8;
int input_dim = 32;
// Make the input
mx::random::seed(42);
auto example_x = mx::random::uniform({batch_size, input_dim});
// Import the function
auto forward = mx::import_function("eval_mlp.mlxfn");
// Call the imported function
auto out = forward({example_x})[0];
std::cout << out << std::endl;
return 0;
}

View File

@@ -1,52 +0,0 @@
# Copyright © 2024 Apple Inc.
import mlx.core as mx
import mlx.nn as nn
import mlx.utils
class MLP(nn.Module):
"""A simple MLP."""
def __init__(
self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
):
super().__init__()
layer_sizes = [input_dim] + [hidden_dim] * num_layers + [output_dim]
self.layers = [
nn.Linear(idim, odim)
for idim, odim in zip(layer_sizes[:-1], layer_sizes[1:])
]
def __call__(self, x):
for l in self.layers[:-1]:
x = nn.relu(l(x))
return self.layers[-1](x)
if __name__ == "__main__":
batch_size = 8
input_dim = 32
output_dim = 10
# Load the model
mx.random.seed(0) # Seed for params
model = MLP(num_layers=5, input_dim=input_dim, hidden_dim=64, output_dim=output_dim)
mx.eval(model)
# Note, the model parameters are saved in the export function
def forward(x):
return model(x)
mx.random.seed(42) # Seed for input
example_x = mx.random.uniform(shape=(batch_size, input_dim))
mx.export_function("eval_mlp.mlxfn", forward, example_x)
# Import in Python
imported_forward = mx.import_function("eval_mlp.mlxfn")
expected = forward(example_x)
(out,) = imported_forward(example_x)
assert mx.allclose(expected, out)
print(out)

View File

@@ -1,35 +0,0 @@
// Copyright © 2024 Apple Inc.
#include <mlx/mlx.h>
#include <iostream>
namespace mx = mlx::core;
int main() {
int batch_size = 8;
int input_dim = 32;
int output_dim = 10;
auto state = mx::import_function("init_mlp.mlxfn")({});
// Make the input
mx::random::seed(42);
auto example_X = mx::random::normal({batch_size, input_dim});
auto example_y = mx::random::randint(0, output_dim, {batch_size});
// Import the function
auto step = mx::import_function("train_mlp.mlxfn");
// Call the imported function
for (int it = 0; it < 100; ++it) {
state.insert(state.end(), {example_X, example_y});
state = step(state);
eval(state);
auto loss = state.back();
state.pop_back();
if (it % 10 == 0) {
std::cout << "Loss " << loss.item<float>() << std::endl;
}
}
return 0;
}

View File

@@ -1,76 +0,0 @@
# Copyright © 2024 Apple Inc.
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
import mlx.utils
class MLP(nn.Module):
"""A simple MLP."""
def __init__(
self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
):
super().__init__()
layer_sizes = [input_dim] + [hidden_dim] * num_layers + [output_dim]
self.layers = [
nn.Linear(idim, odim)
for idim, odim in zip(layer_sizes[:-1], layer_sizes[1:])
]
def __call__(self, x):
for l in self.layers[:-1]:
x = nn.relu(l(x))
return self.layers[-1](x)
if __name__ == "__main__":
batch_size = 8
input_dim = 32
output_dim = 10
def init():
# Seed for the parameter initialization
mx.random.seed(0)
model = MLP(
num_layers=3, input_dim=input_dim, hidden_dim=64, output_dim=output_dim
)
optimizer = optim.SGD(learning_rate=1e-1)
optimizer.init(model.parameters())
state = [model.parameters(), optimizer.state]
tree_structure, state = zip(*mlx.utils.tree_flatten(state))
return model, optimizer, tree_structure, state
# Export the model parameter initialization
model, optimizer, tree_structure, state = init()
mx.eval(state)
mx.export_function("init_mlp.mlxfn", lambda: init()[-1])
def loss_fn(params, X, y):
model.update(params)
return nn.losses.cross_entropy(model(X), y, reduction="mean")
def step(*inputs):
*state, X, y = inputs
params, opt_state = mlx.utils.tree_unflatten(list(zip(tree_structure, state)))
optimizer.state = opt_state
loss, grads = mx.value_and_grad(loss_fn)(params, X, y)
params = optimizer.apply_gradients(grads, params)
_, state = zip(*mlx.utils.tree_flatten([params, optimizer.state]))
return *state, loss
# Make some random data
mx.random.seed(42)
example_X = mx.random.normal(shape=(batch_size, input_dim))
example_y = mx.random.randint(low=0, high=output_dim, shape=(batch_size,))
mx.export_function("train_mlp.mlxfn", step, *state, example_X, example_y)
# Export one step of SGD
imported_step = mx.import_function("train_mlp.mlxfn")
for it in range(100):
*state, loss = imported_step(*state, example_X, example_y)
if it % 10 == 0:
print(f"Loss {loss.item():.6}")

View File

@@ -18,7 +18,8 @@ find_package(
execute_process(
COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE nanobind_ROOT)
OUTPUT_VARIABLE NB_DIR)
list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
find_package(nanobind CONFIG REQUIRED)
# ----------------------------- Extensions -----------------------------

View File

@@ -6,7 +6,6 @@
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/utils.h"
#include "axpby/axpby.h"
@@ -20,7 +19,7 @@
#include "mlx/backend/metal/utils.h"
#endif
namespace my_ext {
namespace mlx::core {
///////////////////////////////////////////////////////////////////////////////
// Operation Implementation
@@ -33,24 +32,24 @@ namespace my_ext {
* Follow numpy style broadcasting between x and y
* Inputs are upcasted to floats if needed
**/
mx::array axpby(
const mx::array& x, // Input mx::array x
const mx::array& y, // Input mx::array y
array axpby(
const array& x, // Input array x
const array& y, // Input array y
const float alpha, // Scaling factor for x
const float beta, // Scaling factor for y
mx::StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
) {
// Promote dtypes between x and y as needed
auto promoted_dtype = promote_types(x.dtype(), y.dtype());
// Upcast to float32 for non-floating point inputs x and y
auto out_dtype = mx::issubdtype(promoted_dtype, mx::float32)
auto out_dtype = issubdtype(promoted_dtype, float32)
? promoted_dtype
: promote_types(promoted_dtype, mx::float32);
: promote_types(promoted_dtype, float32);
// Cast x and y up to the determined dtype (on the same stream s)
auto x_casted = mx::astype(x, out_dtype, s);
auto y_casted = mx::astype(y, out_dtype, s);
auto x_casted = astype(x, out_dtype, s);
auto y_casted = astype(y, out_dtype, s);
// Broadcast the shapes of x and y (on the same stream s)
auto broadcasted_inputs = broadcast_arrays({x_casted, y_casted}, s);
@@ -58,12 +57,12 @@ mx::array axpby(
// Construct the array as the output of the Axpby primitive
// with the broadcasted and upcasted arrays as inputs
return mx::array(
/* const mx::Shape& shape = */ out_shape,
/* mx::Dtype dtype = */ out_dtype,
/* std::shared_ptr<mx::Primitive> primitive = */
return array(
/* const std::vector<int>& shape = */ out_shape,
/* Dtype dtype = */ out_dtype,
/* std::unique_ptr<Primitive> primitive = */
std::make_shared<Axpby>(to_stream(s), alpha, beta),
/* const std::vector<mx::array>& inputs = */ broadcasted_inputs);
/* const std::vector<array>& inputs = */ broadcasted_inputs);
}
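Once the extension is built, the op can be called from Python like any other MLX function. The sketch below is for illustration only: the package path `mlx_sample_extensions` is an assumption, since only the nanobind module name `_ext` appears in this diff.

```python
import mlx.core as mx
from mlx_sample_extensions._ext import axpby  # package name assumed

x = mx.ones((3, 4))
y = mx.full((3, 4), 2.0)
z = axpby(x, y, alpha=4.0, beta=2.0)  # computes alpha * x + beta * y
print(z)                              # every element is 8.0
```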
///////////////////////////////////////////////////////////////////////////////
@@ -72,16 +71,16 @@ mx::array axpby(
template <typename T>
void axpby_impl(
const mx::array& x,
const mx::array& y,
mx::array& out,
const array& x,
const array& y,
array& out,
float alpha_,
float beta_) {
// We only allocate memory when we are ready to fill the output
// malloc_or_wait synchronously allocates available memory
// There may be a wait executed here if the allocation is requested
// under memory-pressured conditions
out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
out.set_data(allocator::malloc_or_wait(out.nbytes()));
// Collect input and output data pointers
const T* x_ptr = x.data<T>();
@@ -95,8 +94,8 @@ void axpby_impl(
// Do the element-wise operation for each output
for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
// Map linear indices to offsets in x and y
auto x_offset = mx::elem_to_loc(out_idx, x.shape(), x.strides());
auto y_offset = mx::elem_to_loc(out_idx, y.shape(), y.strides());
auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());
// We allocate the output to be contiguous and regularly strided
// (defaults to row major) and hence it doesn't need additional mapping
@@ -106,8 +105,8 @@ void axpby_impl(
/** Fall back implementation for evaluation on CPU */
void Axpby::eval(
const std::vector<mx::array>& inputs,
std::vector<mx::array>& outputs) {
const std::vector<array>& inputs,
std::vector<array>& outputs) {
// Check the inputs (registered in the op while constructing the out array)
assert(inputs.size() == 2);
auto& x = inputs[0];
@@ -115,14 +114,14 @@ void Axpby::eval(
auto& out = outputs[0];
// Dispatch to the correct dtype
if (out.dtype() == mx::float32) {
if (out.dtype() == float32) {
return axpby_impl<float>(x, y, out, alpha_, beta_);
} else if (out.dtype() == mx::float16) {
return axpby_impl<mx::float16_t>(x, y, out, alpha_, beta_);
} else if (out.dtype() == mx::bfloat16) {
return axpby_impl<mx::bfloat16_t>(x, y, out, alpha_, beta_);
} else if (out.dtype() == mx::complex64) {
return axpby_impl<mx::complex64_t>(x, y, out, alpha_, beta_);
} else if (out.dtype() == float16) {
return axpby_impl<float16_t>(x, y, out, alpha_, beta_);
} else if (out.dtype() == bfloat16) {
return axpby_impl<bfloat16_t>(x, y, out, alpha_, beta_);
} else if (out.dtype() == complex64) {
return axpby_impl<complex64_t>(x, y, out, alpha_, beta_);
} else {
throw std::runtime_error(
"Axpby is only supported for floating point types.");
@@ -137,9 +136,9 @@ void Axpby::eval(
template <typename T>
void axpby_impl_accelerate(
const mx::array& x,
const mx::array& y,
mx::array& out,
const array& x,
const array& y,
array& out,
float alpha_,
float beta_) {
// Accelerate library provides catlas_saxpby which does
@@ -151,10 +150,10 @@ void axpby_impl_accelerate(
// The data in the output array is allocated to match the strides in y
// such that x, y, and out are contiguous in the same mode and
// no transposition is needed
out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
out.set_data(allocator::malloc_or_wait(out.nbytes()));
// We then copy over the elements using the contiguous vector specialization
copy_inplace(y, out, mx::CopyType::Vector);
copy_inplace(y, out, CopyType::Vector);
// Get x and y pointers for catlas_saxpby
const T* x_ptr = x.data<T>();
@@ -176,15 +175,15 @@ void axpby_impl_accelerate(
/** Evaluate primitive on CPU using accelerate specializations */
void Axpby::eval_cpu(
const std::vector<mx::array>& inputs,
std::vector<mx::array>& outputs) {
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 2);
auto& x = inputs[0];
auto& y = inputs[1];
auto& out = outputs[0];
// Accelerate specialization for contiguous single precision float arrays
if (out.dtype() == mx::float32 &&
if (out.dtype() == float32 &&
((x.flags().row_contiguous && y.flags().row_contiguous) ||
(x.flags().col_contiguous && y.flags().col_contiguous))) {
axpby_impl_accelerate<float>(x, y, out, alpha_, beta_);
@@ -199,8 +198,8 @@ void Axpby::eval_cpu(
/** Evaluate primitive on CPU falling back to common backend */
void Axpby::eval_cpu(
const std::vector<mx::array>& inputs,
std::vector<mx::array>& outputs) {
const std::vector<array>& inputs,
const std::vector<array>& outputs) {
eval(inputs, outputs);
}
@@ -214,8 +213,8 @@ void Axpby::eval_cpu(
/** Evaluate primitive on GPU */
void Axpby::eval_gpu(
const std::vector<mx::array>& inputs,
std::vector<mx::array>& outputs) {
const std::vector<array>& inputs,
std::vector<array>& outputs) {
// Prepare inputs
assert(inputs.size() == 2);
auto& x = inputs[0];
@@ -226,7 +225,7 @@ void Axpby::eval_gpu(
// and each stream carries its device identifiers
auto& s = stream();
// We get the needed metal device using the stream
auto& d = mx::metal::device(s.device);
auto& d = metal::device(s.device);
// Prepare to specialize based on contiguity
bool contiguous_kernel =
@@ -236,12 +235,12 @@ void Axpby::eval_gpu(
// Allocate output memory with strides based on specialization
if (contiguous_kernel) {
out.set_data(
mx::allocator::malloc_or_wait(x.data_size() * out.itemsize()),
allocator::malloc_or_wait(x.data_size() * out.itemsize()),
x.data_size(),
x.strides(),
x.flags());
} else {
out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
// Resolve name of kernel (corresponds to axpby.metal)
@@ -280,7 +279,7 @@ void Axpby::eval_gpu(
if (!contiguous_kernel) {
compute_encoder.set_vector_bytes(x.shape(), 5);
compute_encoder.set_vector_bytes(x.strides(), 6);
compute_encoder.set_vector_bytes(y.strides(), 7);
compute_encoder.set_bytes(y.strides(), 7);
compute_encoder.set_bytes(ndim, 8);
}
@@ -303,8 +302,8 @@ void Axpby::eval_gpu(
/** Fail evaluation on GPU */
void Axpby::eval_gpu(
const std::vector<mx::array>& inputs,
std::vector<mx::array>& out) {
const std::vector<array>& inputs,
std::vector<array>& out) {
throw std::runtime_error("Axpby has no GPU implementation.");
}
@@ -315,9 +314,9 @@ void Axpby::eval_gpu(
///////////////////////////////////////////////////////////////////////////////
/** The Jacobian-vector product. */
std::vector<mx::array> Axpby::jvp(
const std::vector<mx::array>& primals,
const std::vector<mx::array>& tangents,
std::vector<array> Axpby::jvp(
const std::vector<array>& primals,
const std::vector<array>& tangents,
const std::vector<int>& argnums) {
// Forward mode diff that pushes along the tangents
// The jvp transform on the primitive can built with ops
@@ -329,8 +328,8 @@ std::vector<mx::array> Axpby::jvp(
// scaled by beta
if (argnums.size() > 1) {
auto scale = argnums[0] == 0 ? alpha_ : beta_;
auto scale_arr = mx::array(scale, tangents[0].dtype());
return {mx::multiply(scale_arr, tangents[0], stream())};
auto scale_arr = array(scale, tangents[0].dtype());
return {multiply(scale_arr, tangents[0], stream())};
}
// If, argnums = {0, 1}, we take contributions from both
// which gives us jvp = tangent_x * alpha + tangent_y * beta
@@ -340,24 +339,24 @@ std::vector<mx::array> Axpby::jvp(
}
/** The vector-Jacobian product. */
std::vector<mx::array> Axpby::vjp(
const std::vector<mx::array>& primals,
const std::vector<mx::array>& cotangents,
std::vector<array> Axpby::vjp(
const std::vector<array>& primals,
const std::vector<array>& cotangents,
const std::vector<int>& argnums,
const std::vector<mx::array>&) {
const std::vector<array>&) {
// Reverse mode diff
std::vector<mx::array> vjps;
std::vector<array> vjps;
for (auto arg : argnums) {
auto scale = arg == 0 ? alpha_ : beta_;
auto scale_arr = mx::array(scale, cotangents[0].dtype());
vjps.push_back(mx::multiply(scale_arr, cotangents[0], stream()));
auto scale_arr = array(scale, cotangents[0].dtype());
vjps.push_back(multiply(scale_arr, cotangents[0], stream()));
}
return vjps;
}
/** Vectorize primitive along given axis */
std::pair<std::vector<mx::array>, std::vector<int>> Axpby::vmap(
const std::vector<mx::array>& inputs,
std::pair<std::vector<array>, std::vector<int>> Axpby::vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) {
throw std::runtime_error("Axpby has no vmap implementation.");
}
@@ -368,4 +367,4 @@ bool Axpby::is_equivalent(const Primitive& other) const {
return alpha_ == r_other.alpha_ && beta_ == r_other.beta_;
}
} // namespace my_ext
} // namespace mlx::core

View File

@@ -5,9 +5,7 @@
#include "mlx/ops.h"
#include "mlx/primitives.h"
namespace mx = mlx::core;
namespace my_ext {
namespace mlx::core {
///////////////////////////////////////////////////////////////////////////////
// Operation
@@ -20,22 +18,22 @@ namespace my_ext {
* Follow numpy style broadcasting between x and y
* Inputs are upcasted to floats if needed
**/
mx::array axpby(
const mx::array& x, // Input array x
const mx::array& y, // Input array y
array axpby(
const array& x, // Input array x
const array& y, // Input array y
const float alpha, // Scaling factor for x
const float beta, // Scaling factor for y
mx::StreamOrDevice s = {} // Stream on which to schedule the operation
StreamOrDevice s = {} // Stream on which to schedule the operation
);
///////////////////////////////////////////////////////////////////////////////
// Primitive
///////////////////////////////////////////////////////////////////////////////
class Axpby : public mx::Primitive {
class Axpby : public Primitive {
public:
explicit Axpby(mx::Stream stream, float alpha, float beta)
: mx::Primitive(stream), alpha_(alpha), beta_(beta) {};
explicit Axpby(Stream stream, float alpha, float beta)
: Primitive(stream), alpha_(alpha), beta_(beta) {};
/**
* A primitive must know how to evaluate itself on the CPU/GPU
@@ -44,25 +42,23 @@ class Axpby : public mx::Primitive {
* To avoid unnecessary allocations, the evaluation function
* is responsible for allocating space for the array.
*/
void eval_cpu(
const std::vector<mx::array>& inputs,
std::vector<mx::array>& outputs) override;
void eval_gpu(
const std::vector<mx::array>& inputs,
std::vector<mx::array>& outputs) override;
void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
override;
void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
override;
/** The Jacobian-vector product. */
std::vector<mx::array> jvp(
const std::vector<mx::array>& primals,
const std::vector<mx::array>& tangents,
std::vector<array> jvp(
const std::vector<array>& primals,
const std::vector<array>& tangents,
const std::vector<int>& argnums) override;
/** The vector-Jacobian product. */
std::vector<mx::array> vjp(
const std::vector<mx::array>& primals,
const std::vector<mx::array>& cotangents,
std::vector<array> vjp(
const std::vector<array>& primals,
const std::vector<array>& cotangents,
const std::vector<int>& argnums,
const std::vector<mx::array>& outputs) override;
const std::vector<array>& outputs) override;
/**
* The primitive must know how to vectorize itself across
@@ -70,8 +66,8 @@ class Axpby : public mx::Primitive {
* representing the vectorized computation and the axis which
* corresponds to the output vectorized dimension.
*/
std::pair<std::vector<mx::array>, std::vector<int>> vmap(
const std::vector<mx::array>& inputs,
std::pair<std::vector<array>, std::vector<int>> vmap(
const std::vector<array>& inputs,
const std::vector<int>& axes) override;
/** Print the primitive. */
@@ -80,16 +76,14 @@ class Axpby : public mx::Primitive {
}
/** Equivalence check **/
bool is_equivalent(const mx::Primitive& other) const override;
bool is_equivalent(const Primitive& other) const override;
private:
float alpha_;
float beta_;
/** Fall back implementation for evaluation on CPU */
void eval(
const std::vector<mx::array>& inputs,
std::vector<mx::array>& outputs);
void eval(const std::vector<array>& inputs, std::vector<array>& outputs);
};
} // namespace my_ext
} // namespace mlx::core

View File

@@ -8,12 +8,14 @@
namespace nb = nanobind;
using namespace nb::literals;
using namespace mlx::core;
NB_MODULE(_ext, m) {
m.doc() = "Sample extension for MLX";
m.def(
"axpby",
&my_ext::axpby,
&axpby,
"x"_a,
"y"_a,
"alpha"_a,

View File

@@ -1,8 +1,8 @@
[build-system]
requires = [
"setuptools>=42",
"cmake>=3.25",
"cmake>=3.24",
"mlx>=0.18.0",
"nanobind==2.4.0",
"nanobind==2.2.0",
]
build-backend = "setuptools.build_meta"

View File

@@ -1,4 +1,4 @@
setuptools>=42
cmake>=3.25
cmake>=3.24
mlx>=0.21.0
nanobind==2.2.0

View File

@@ -5,7 +5,6 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
${CMAKE_CURRENT_SOURCE_DIR}/export.cpp
${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
@@ -19,26 +18,21 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/linalg.cpp
${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h)
if(MSVC)
# Disable some MSVC warnings to speed up compilation.
target_compile_options(mlx PUBLIC /wd4068 /wd4244 /wd4267 /wd4804)
endif()
if(WIN32)
# Export symbols by default to behave like macOS/linux.
set_target_properties(mlx PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
endif()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
if(MLX_BUILD_CPU)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cpu)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
else()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_cpu)
endif()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/distributed)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)
if(MLX_BUILD_ACCELERATE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/accelerate)
elseif(MLX_BUILD_CPU)
target_sources(
mlx
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/common/default_primitives.cpp)
endif()
if(MLX_BUILD_METAL)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/metal)

View File

@@ -10,8 +10,22 @@
namespace mlx::core {
namespace {
/** Return true if we are currently performing a function transformation in
* order to keep the graph when evaluating tracer arrays. */
bool in_tracing() {
return detail::InTracing::in_tracing();
}
bool retain_graph() {
return detail::RetainGraph::retain_graph();
}
} // namespace
array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
: array_desc_(std::make_shared<ArrayDesc>(Shape{}, dtype)) {
: array_desc_(std::make_shared<ArrayDesc>(std::vector<int>{}, dtype)) {
auto cval = static_cast<complex64_t>(val);
init(&cval);
}
@@ -47,14 +61,14 @@ std::vector<array> array::make_arrays(
array::array(std::initializer_list<float> data)
: array_desc_(std::make_shared<ArrayDesc>(
Shape{static_cast<ShapeElem>(data.size())},
std::vector<int>{static_cast<int>(data.size())},
float32)) {
init(data.begin());
}
array::array(std::initializer_list<int> data, Dtype dtype)
: array_desc_(std::make_shared<ArrayDesc>(
Shape{static_cast<ShapeElem>(data.size())},
std::vector<int>{static_cast<int>(data.size())},
dtype)) {
init(data.begin());
}
@@ -105,8 +119,7 @@ void array::eval() {
}
bool array::is_tracer() const {
return (array_desc_->is_tracer && detail::in_tracing()) ||
detail::retain_graph();
return array_desc_->is_tracer && in_tracing() || retain_graph();
}
void array::set_data(allocator::Buffer buffer, Deleter d) {
@@ -264,19 +277,7 @@ array::ArrayDesc::~ArrayDesc() {
}
ad.inputs.clear();
for (auto& [_, a] : input_map) {
bool is_deletable =
(a.array_desc_.use_count() <= a.siblings().size() + 1);
// An array with siblings is deletable only if all of its siblings
// are deletable
for (auto& s : a.siblings()) {
if (!is_deletable) {
break;
}
int is_input = (input_map.find(s.id()) != input_map.end());
is_deletable &=
s.array_desc_.use_count() <= a.siblings().size() + is_input;
}
if (is_deletable) {
if (a.array_desc_.use_count() <= a.siblings().size() + 1) {
for_deletion.push_back(std::move(a.array_desc_));
}
}
@@ -309,7 +310,7 @@ array::ArrayIterator::ArrayIterator(const array& arr, int idx)
}
array::ArrayIterator::reference array::ArrayIterator::operator*() const {
auto start = Shape(arr.ndim(), 0);
auto start = std::vector<int>(arr.ndim(), 0);
auto end = arr.shape();
auto shape = arr.shape();
shape.erase(shape.begin());

View File

@@ -17,8 +17,7 @@ namespace mlx::core {
class Primitive;
using Deleter = std::function<void(allocator::Buffer)>;
using ShapeElem = int32_t;
using Shape = std::vector<ShapeElem>;
using Shape = std::vector<int32_t>;
using Strides = std::vector<int64_t>;
class array {
@@ -35,29 +34,29 @@ class array {
explicit array(const std::complex<float>& val, Dtype dtype = complex64);
template <typename It>
explicit array(
array(
It data,
Shape shape,
Dtype dtype =
TypeToDtype<typename std::iterator_traits<It>::value_type>());
template <typename T>
explicit array(std::initializer_list<T> data, Dtype dtype = TypeToDtype<T>());
array(std::initializer_list<T> data, Dtype dtype = TypeToDtype<T>());
/* Special case so empty lists default to float32. */
explicit array(std::initializer_list<float> data);
array(std::initializer_list<float> data);
/* Special case so array({}, type) is an empty array. */
explicit array(std::initializer_list<int> data, Dtype dtype);
array(std::initializer_list<int> data, Dtype dtype);
template <typename T>
explicit array(
array(
std::initializer_list<T> data,
Shape shape,
Dtype dtype = TypeToDtype<T>());
/* Build an array from a buffer */
explicit array(
array(
allocator::Buffer data,
Shape shape,
Dtype dtype,
@@ -499,7 +498,7 @@ class array {
template <typename T>
array::array(T val, Dtype dtype /* = TypeToDtype<T>() */)
: array_desc_(std::make_shared<ArrayDesc>(Shape{}, dtype)) {
: array_desc_(std::make_shared<ArrayDesc>(std::vector<int>{}, dtype)) {
init(&val);
}
@@ -517,7 +516,7 @@ array::array(
std::initializer_list<T> data,
Dtype dtype /* = TypeToDtype<T>() */)
: array_desc_(std::make_shared<ArrayDesc>(
Shape{static_cast<ShapeElem>(data.size())},
std::vector<int>{static_cast<int>(data.size())},
dtype)) {
init(data.begin());
}

View File

@@ -0,0 +1,8 @@
target_sources(
mlx
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp)

View File

@@ -0,0 +1,20 @@
// Copyright © 2023-2024 Apple Inc.
#include <cassert>
#include <Accelerate/Accelerate.h>
#include <simd/vector.h>
#include "mlx/backend/common/copy.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
namespace mlx::core {
void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
eval(inputs, out);
// TODO: Add accelerate based optimizations for CPU conv
}
} // namespace mlx::core

View File

@@ -0,0 +1,253 @@
// Copyright © 2023-2024 Apple Inc.
#include <cassert>
#include <Accelerate/Accelerate.h>
#include "mlx/backend/accelerate/utils.h"
#include "mlx/backend/common/copy.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
namespace mlx::core {
namespace {
std::tuple<bool, size_t, array> check_transpose(const array& arr) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (stx == arr.shape(-1) && sty == 1) {
return std::make_tuple(false, stx, arr);
} else if (stx == 1 && sty == arr.shape(-2)) {
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::General);
size_t stx = arr.shape(-1);
return std::make_tuple(false, stx, arr_copy);
}
}
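The `check_transpose` helper above decides whether the trailing two dimensions are row-contiguous, column-contiguous (a transposed view), or neither — in which case a row-major copy is made — and returns the leading dimension to hand to the GEMM call. A small Python sketch of the same decision, assuming `shape` and `strides` describe the last two axes in elements:

```python
def check_transpose(shape, strides):
    # Returns (transposed, leading_dim, needs_copy) for the trailing 2-D view.
    stx, sty = strides[-2], strides[-1]
    if stx == shape[-1] and sty == 1:   # row-contiguous: use as-is
        return False, stx, False
    if stx == 1 and sty == shape[-2]:   # column-contiguous: treat as transposed
        return True, sty, False
    return False, shape[-1], True       # general strides: copy to row-major first
```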
inline void matmul_cblas_general(
const array& a_pre,
const array& b_pre,
array& out,
float alpha = 1.0f,
float beta = 0.0f) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[matmul_cblas] on CPU currently only supports float32");
}
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
size_t M = a.shape(-2);
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (M == 0 || N == 0) {
return;
}
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
for (int i = 0; i < (a.size() / (M * K)); ++i) {
cblas_sgemm(
CblasRowMajor,
a_transposed ? CblasTrans : CblasNoTrans, // transA
b_transposed ? CblasTrans : CblasNoTrans, // transB
M,
N,
K,
alpha, // alpha
a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
lda,
b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
ldb,
beta, // beta
out.data<float>() + M * N * i,
out.shape(-1) // ldc
);
}
}
inline void matmul_cblas(const array& a_pre, const array& b_pre, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[matmul_cblas] on CPU currently only supports float32");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
return matmul_cblas_general(a_pre, b_pre, out);
}
inline void matmul_bnns_general(
const array& a_pre,
const array& b_pre,
array& out,
float alpha = 1.0f,
float beta = 0.0f) {
// TODO: Update to utilize BNNS broadcasting
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
size_t M = a.shape(-2);
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (M == 0 || N == 0) {
return;
}
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
BNNSDataType bnns_dtype = to_bnns_dtype(out.dtype());
const BNNSLayerParametersBroadcastMatMul gemm_params{
/* float alpha = */ alpha,
/* float beta = */ beta,
/* bool transA = */ a_transposed,
/* bool transB = */ b_transposed,
/* bool quadratic = */ false,
/* bool a_is_weights = */ false,
/* bool b_is_weights = */ false,
/* BNNSNDArrayDescriptor iA_desc = */
BNNSNDArrayDescriptor{
/* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
/* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,
/* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
{lda, (M * K) / lda, 0, 0, 0, 0, 0, 0},
/* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
{1, lda, 0, 0, 0, 0, 0, 0},
/* void * _Nullable data = */ nullptr,
/* BNNSDataType data_type = */ bnns_dtype,
/* void * _Nullable table_data = */ nullptr,
/* BNNSDataType table_data_type = */ bnns_dtype,
/* float data_scale = */ 1.0,
/* float data_bias = */ 0.0,
},
/* BNNSNDArrayDescriptor iB_desc = */
BNNSNDArrayDescriptor{
/* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
/* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,
/* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
{ldb, (K * N) / ldb, 0, 0, 0, 0, 0, 0},
/* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
{1, ldb, 0, 0, 0, 0, 0, 0},
/* void * _Nullable data = */ nullptr,
/* BNNSDataType data_type = */ bnns_dtype,
/* void * _Nullable table_data = */ nullptr,
/* BNNSDataType table_data_type = */ bnns_dtype,
/* float data_scale = */ 1.0,
/* float data_bias = */ 0.0,
},
/* BNNSNDArrayDescriptor o_desc = */
BNNSNDArrayDescriptor{
/* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
/* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,
/* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
{N, M, 0, 0, 0, 0, 0, 0},
/* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
{1, N, 0, 0, 0, 0, 0, 0},
/* void * _Nullable data = */ nullptr,
/* BNNSDataType data_type = */ bnns_dtype,
/* void * _Nullable table_data = */ nullptr,
/* BNNSDataType table_data_type = */ bnns_dtype,
/* float data_scale = */ 1.0,
/* float data_bias = */ 0.0,
},
};
auto bnns_filter =
BNNSFilterCreateLayerBroadcastMatMul(&gemm_params, nullptr);
for (int i = 0; i < (a.size() / (M * K)); ++i) {
BNNSFilterApplyTwoInput(
bnns_filter,
a.data<uint8_t>() +
elem_to_loc(M * K * i, a.shape(), a.strides()) * a.itemsize(),
b.data<uint8_t>() +
elem_to_loc(K * N * i, b.shape(), b.strides()) * b.itemsize(),
out.data<uint8_t>() + M * N * i * out.itemsize());
}
BNNSFilterDestroy(bnns_filter);
}
inline void matmul_bnns(const array& a_pre, const array& b_pre, array& out) {
// TODO: Update to utilize BNNS broadcasting
out.set_data(allocator::malloc_or_wait(out.nbytes()));
return matmul_bnns_general(a_pre, b_pre, out);
}
template <typename T>
inline void mask_matrix(
T* data,
const bool* mask,
int tile_size,
const int X,
const int Y,
const size_t X_data_str,
const size_t Y_data_str,
const size_t X_mask_str,
const size_t Y_mask_str) {
int tX = (X + tile_size - 1) / tile_size;
int tY = (Y + tile_size - 1) / tile_size;
for (int i = 0; i < tX; i++) {
for (int j = 0; j < tY; j++) {
bool do_mask = mask[i * X_mask_str + j * Y_mask_str];
if (!do_mask) {
int loc_x = i * tile_size;
int loc_y = j * tile_size;
T* data_block = data + loc_x * X_data_str + loc_y * Y_data_str;
int size_x = std::min(tile_size, X - loc_x);
int size_y = std::min(tile_size, Y - loc_y);
for (int ii = 0; ii < size_x; ii++) {
for (int jj = 0; jj < size_y; jj++) {
data_block[ii * X_data_str + jj * Y_data_str] = T(0.);
}
}
}
}
}
}
} // namespace
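A plain-Python sketch of the tiled masking performed by `mask_matrix` above: tile `(i, j)` of the data is zeroed whenever the corresponding mask element is false.

```python
def mask_matrix(data, mask, tile_size):
    # data: X x Y list of lists; mask: one boolean per tile_size x tile_size tile.
    X, Y = len(data), len(data[0])
    for ti in range(0, X, tile_size):
        for tj in range(0, Y, tile_size):
            if not mask[ti // tile_size][tj // tile_size]:
                for i in range(ti, min(ti + tile_size, X)):
                    for j in range(tj, min(tj + tile_size, Y)):
                        data[i][j] = 0.0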
void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() == float32) {
return matmul_cblas(inputs[0], inputs[1], out);
}
return matmul_bnns(inputs[0], inputs[1], out);
}
void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
// Fill output with C
auto& c = inputs[2];
CopyType ctype = c.data_size() == 1 ? CopyType::Scalar : CopyType::General;
copy(c, out, ctype);
if (out.dtype() == float32) {
return matmul_cblas_general(inputs[0], inputs[1], out, alpha_, beta_);
}
return matmul_bnns_general(inputs[0], inputs[1], out, alpha_, beta_);
}
} // namespace mlx::core

View File

@@ -0,0 +1,601 @@
// Copyright © 2023-2024 Apple Inc.
#include <cassert>
#include <cmath>
#include <Accelerate/Accelerate.h>
#include "mlx/allocator.h"
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/unary.h"
#include "mlx/primitives.h"
#define DEFAULT(primitive) \
void primitive::eval_cpu(const std::vector<array>& inputs, array& out) { \
primitive::eval(inputs, out); \
}
#define DEFAULT_MULTI(primitive) \
void primitive::eval_cpu( \
const std::vector<array>& inputs, std::vector<array>& outputs) { \
primitive::eval(inputs, outputs); \
}
namespace mlx::core {
// Use the default implementation for the following primitives
DEFAULT(Arange)
DEFAULT(ArgPartition)
DEFAULT(ArgReduce)
DEFAULT(ArgSort)
DEFAULT(AsStrided)
DEFAULT(BlockMaskedMM)
DEFAULT(Broadcast)
DEFAULT(Ceil)
DEFAULT(Concatenate)
DEFAULT(Conjugate)
DEFAULT(Copy)
DEFAULT_MULTI(CustomTransforms)
DEFAULT_MULTI(Depends)
DEFAULT_MULTI(DivMod)
DEFAULT(NumberOfElements)
DEFAULT(Equal)
DEFAULT(Erf)
DEFAULT(ErfInv)
DEFAULT(FFT)
DEFAULT(Floor)
DEFAULT(Gather)
DEFAULT(GatherMM)
DEFAULT(GatherQMM)
DEFAULT(Greater)
DEFAULT(GreaterEqual)
DEFAULT(Hadamard)
DEFAULT(Less)
DEFAULT(LessEqual)
DEFAULT(Load)
DEFAULT(LogicalNot)
DEFAULT(LogicalAnd)
DEFAULT(LogicalOr)
DEFAULT(LogAddExp)
DEFAULT(Maximum)
DEFAULT(Minimum)
DEFAULT(NotEqual)
DEFAULT(Pad)
DEFAULT(Partition)
DEFAULT_MULTI(QRF)
DEFAULT(RandomBits)
DEFAULT(Reshape)
DEFAULT(Remainder)
DEFAULT(Round)
DEFAULT(Scatter)
DEFAULT(Select)
DEFAULT(Sigmoid)
DEFAULT(Sign)
DEFAULT(Slice)
DEFAULT(SliceUpdate)
DEFAULT_MULTI(Split)
DEFAULT(Sort)
DEFAULT(StopGradient)
DEFAULT_MULTI(SVD)
DEFAULT(Transpose)
DEFAULT(Inverse)
DEFAULT(Cholesky)
DEFAULT_MULTI(Eigh)
void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
vDSP_vabs(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
} else if (in.dtype() == int32 && in.flags().contiguous) {
set_unary_output_data(in, out);
vDSP_vabsi(in.data<int>(), 1, out.data<int>(), 1, in.data_size());
} else {
eval(inputs, out);
}
}
void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (a.dtype() == float32) {
binary_op<float>(
a,
b,
out,
[](auto x, auto y) { return x + y; },
[](const auto* s, const auto* vec, auto* o, auto n) {
vDSP_vsadd((const float*)vec, 1, (const float*)s, (float*)o, 1, n);
},
[](const auto* vec, const auto* s, auto* o, auto n) {
vDSP_vsadd((const float*)vec, 1, (const float*)s, (float*)o, 1, n);
},
[](const auto* a, const auto* b, auto* o, auto n) {
vDSP_vadd((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n);
});
} else if (a.dtype() == int32) {
binary_op<int>(
a,
b,
out,
[](auto x, auto y) { return x + y; },
[](const auto* s, const auto* vec, auto* o, auto n) {
vDSP_vsaddi((const int*)vec, 1, (const int*)s, (int*)o, 1, n);
},
[](const auto* vec, const auto* s, auto* o, auto n) {
vDSP_vsaddi((const int*)vec, 1, (const int*)s, (int*)o, 1, n);
},
[](const auto* a, const auto* b, auto* o, auto n) {
vDSP_vaddi((const int*)a, 1, (const int*)b, 1, (int*)o, 1, n);
});
} else {
eval(inputs, out);
}
}
void ArcCos::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvacosf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void ArcCosh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvacoshf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void ArcSin::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvasinf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void ArcSinh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvasinhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void ArcTan::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvatanf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (out.dtype() == float32 && a.flags().row_contiguous &&
b.flags().row_contiguous) {
if (a.is_donatable()) {
out.copy_shared_buffer(a);
} else if (b.is_donatable()) {
out.copy_shared_buffer(b);
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
int size = a.data_size();
vvatan2f(out.data<float>(), a.data<float>(), b.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void ArcTanh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvatanhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void AsType::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.flags().contiguous) {
// Use accelerate functions if possible
if (in.dtype() == float32 && out.dtype() == uint32) {
set_unary_output_data(in, out);
vDSP_vfixu32(
in.data<float>(), 1, out.data<uint32_t>(), 1, in.data_size());
return;
} else if (in.dtype() == float32 && out.dtype() == int32) {
set_unary_output_data(in, out);
vDSP_vfix32(in.data<float>(), 1, out.data<int32_t>(), 1, in.data_size());
return;
} else if (in.dtype() == uint32 && out.dtype() == float32) {
set_unary_output_data(in, out);
vDSP_vfltu32(
in.data<uint32_t>(), 1, out.data<float>(), 1, in.data_size());
return;
} else if (in.dtype() == int32 && out.dtype() == float32) {
set_unary_output_data(in, out);
vDSP_vflt32(in.data<int32_t>(), 1, out.data<float>(), 1, in.data_size());
return;
}
}
eval(inputs, out);
}
void Cos::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvcosf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void Cosh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvcoshf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (a.dtype() == int32) {
binary_op<int>(
a,
b,
out,
[](auto x, auto y) { return x / y; },
UseDefaultBinaryOp(),
[](const auto* vec, const auto* s, auto* o, auto n) {
vDSP_vsdivi((const int*)vec, 1, (const int*)s, (int*)o, 1, n);
},
[](const auto* a, const auto* b, auto* o, auto n) {
vDSP_vdivi((const int*)b, 1, (const int*)a, 1, (int*)o, 1, n);
});
} else if (a.dtype() == float32) {
binary_op<float>(
a,
b,
out,
[](auto x, auto y) { return x / y; },
[](const auto* s, const auto* vec, auto* o, auto n) {
vDSP_svdiv((const float*)s, (const float*)vec, 1, (float*)o, 1, n);
},
[](const auto* vec, const auto* s, auto* o, auto n) {
vDSP_vsdiv((const float*)vec, 1, (const float*)s, (float*)o, 1, n);
},
[](const auto* a, const auto* b, auto* o, auto n) {
vDSP_vdiv((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
});
} else {
eval(inputs, out);
}
}
void Exp::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
vvexpf(out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
} else {
eval(inputs, out);
}
}
void Expm1::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
vvexpm1f(
out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
} else {
eval(inputs, out);
}
}
void Full::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
assert(in.dtype() == out.dtype());
if (in.data_size() == 1 && out.dtype() == float32) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
vDSP_vfill(in.data<float>(), out.data<float>(), 1, out.size());
} else {
eval(inputs, out);
}
}
void Log::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
switch (base_) {
case Base::e:
vvlogf(
out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
break;
case Base::two:
vvlog2f(
out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
break;
case Base::ten:
vvlog10f(
out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
break;
}
} else {
eval(inputs, out);
}
}
void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
vvlog1pf(
out.data<float>(), in.data<float>(), reinterpret_cast<int*>(&size));
} else {
eval(inputs, out);
}
}
void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (a.dtype() == float32) {
binary_op<float>(
a,
b,
out,
[](auto x, auto y) { return x * y; },
[](const auto* s, const auto* vec, auto* o, auto n) {
vDSP_vsmul((const float*)vec, 1, (const float*)s, (float*)o, 1, n);
},
[](const auto* vec, const auto* s, auto* o, auto n) {
vDSP_vsmul((const float*)vec, 1, (const float*)s, (float*)o, 1, n);
},
[](const auto* a, const auto* b, auto* o, auto n) {
vDSP_vmul((const float*)a, 1, (const float*)b, 1, (float*)o, 1, n);
});
} else {
eval(inputs, out);
}
}
void Negative::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
vDSP_vneg(in.data<float>(), 1, out.data<float>(), 1, in.data_size());
} else {
eval(inputs, out);
}
}
void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (out.dtype() == float32 && a.flags().row_contiguous &&
b.flags().row_contiguous) {
int size = a.size();
if (a.is_donatable() && a.itemsize() == out.itemsize()) {
out.copy_shared_buffer(a);
} else if (b.is_donatable() && b.itemsize() == out.itemsize()) {
out.copy_shared_buffer(b);
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
vvpowf(out.data<float>(), b.data<float>(), a.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void Scan::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (reduce_type_ == Scan::Sum && out.dtype() == float32 &&
in.flags().row_contiguous && in.strides()[axis_] == 1 && !inclusive_) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
int stride = in.shape(axis_);
int count = in.size() / stride;
const float* input = in.data<float>();
float* output = out.data<float>();
float s = 1.0;
if (!reverse_) {
for (int i = 0; i < count; i++) {
vDSP_vrsum(input - 1, 1, &s, output, 1, stride);
input += stride;
output += stride;
}
} else {
for (int i = 0; i < count; i++) {
input += stride - 1;
output += stride - 1;
vDSP_vrsum(input + 1, -1, &s, output, -1, stride);
input++;
output++;
}
}
} else {
eval(inputs, out);
}
}
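The fast path above handles non-inclusive sums: `vDSP_vrsum` applied with the input pointer shifted back by one produces an exclusive running sum along the contiguous scan axis. A scalar sketch of what that computes for one row:

```python
def exclusive_cumsum(row):
    # out[0] = 0, out[i] = row[0] + ... + row[i-1]
    out, acc = [], 0.0
    for v in row:
        out.append(acc)
        acc += v
    return out

# exclusive_cumsum([1.0, 2.0, 3.0]) -> [0.0, 1.0, 3.0]
```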
void Sin::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvsinf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void Sinh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvsinhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void Square::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
auto size = in.data_size();
vDSP_vsq(in.data<float>(), 1, out.data<float>(), 1, size);
} else {
eval(inputs, out);
}
}
void Sqrt::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
if (recip_) {
vvrsqrtf(out.data<float>(), in.data<float>(), &size);
} else {
vvsqrtf(out.data<float>(), in.data<float>(), &size);
}
} else {
eval(inputs, out);
}
}
void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (a.dtype() == float32) {
binary_op<float>(
a,
b,
out,
[](auto x, auto y) { return x - y; },
[](const auto* s, const auto* vec, auto* o, auto n) {
float minus_1 = -1;
vDSP_vsmsa(
(const float*)vec, 1, &minus_1, (const float*)s, (float*)o, 1, n);
},
[](const auto* vec, const auto* s, auto* o, auto n) {
float val = -(*s);
vDSP_vsadd((const float*)vec, 1, &val, (float*)o, 1, n);
},
[](const auto* a, const auto* b, auto* o, auto n) {
vDSP_vsub((const float*)b, 1, (const float*)a, 1, (float*)o, 1, n);
});
} else if (a.dtype() == int32) {
binary_op<int>(
a,
b,
out,
[](auto x, auto y) { return x - y; },
UseDefaultBinaryOp(),
[](const auto* vec, const auto* s, auto* o, auto n) {
int val = -(*s);
vDSP_vsaddi((const int*)vec, 1, &val, (int*)o, 1, n);
},
UseDefaultBinaryOp());
} else {
eval(inputs, out);
}
}
void Tan::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvtanf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
void Tanh::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == float32 && in.flags().contiguous) {
set_unary_output_data(in, out);
int size = in.data_size();
vvtanhf(out.data<float>(), in.data<float>(), &size);
} else {
eval(inputs, out);
}
}
} // namespace mlx::core

View File

@@ -0,0 +1,117 @@
// Copyright © 2023 Apple Inc.
#include <cassert>
#include <simd/vector.h>
#include "mlx/primitives.h"
namespace mlx::core {
namespace {
void _qmm_t_4_64(
float* result,
const float* x,
const uint32_t* w,
const float* scales,
const float* biases,
int M,
int N,
int K,
int B,
bool batched_w) {
constexpr int bits = 4;
constexpr int group_size = 64;
constexpr int bitmask = (1 << bits) - 1;
constexpr int pack_factor = 32 / bits;
constexpr int packs_in_group = group_size / pack_factor;
int w_els = N * K / pack_factor;
int g_els = w_els * pack_factor / group_size;
for (int i = 0; i < B; i++) {
for (int m = 0; m < M; m++) {
const uint32_t* w_local = w;
const float* scales_local = scales;
const float* biases_local = biases;
for (int n = 0; n < N; n++) {
const simd_float16* x_local = (simd_float16*)x;
simd_float16 sum = 0;
for (int k = 0; k < K; k += group_size) {
float scale = *scales_local++;
float bias = *biases_local++;
for (int kw = 0; kw < packs_in_group; kw += 2) {
// TODO: vectorize this properly
simd_uint16 wi;
for (int e = 0; e < 2; e++) {
uint32_t wii = *w_local++;
for (int p = 0; p < 8; p++) {
wi[e * 8 + p] = wii & bitmask;
wii >>= bits;
}
}
simd_float16 wf = simd_float(wi);
wf *= scale;
wf += bias;
sum += (*x_local) * wf;
x_local++;
}
}
*result = simd_reduce_add(sum);
result++;
}
x += K;
}
if (batched_w) {
w += w_els;
scales += g_els;
biases += g_els;
}
}
}
} // namespace
void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 4);
auto& x = inputs[0];
auto& w = inputs[1];
auto& scales = inputs[2];
auto& biases = inputs[3];
bool condition =
(transpose_ && x.flags().row_contiguous && w.flags().row_contiguous &&
scales.flags().row_contiguous && biases.flags().row_contiguous &&
x.dtype() == float32 && bits_ == 4 && group_size_ == 64);
if (condition) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
int K = x.shape(-1);
int M = x.shape(-2);
int N = out.shape(-1);
int B = x.size() / K / M;
bool batched_w = w.ndim() > 2;
_qmm_t_4_64(
out.data<float>(),
x.data<float>(),
w.data<uint32_t>(),
scales.data<float>(),
biases.data<float>(),
M,
N,
K,
B,
batched_w);
} else {
eval(inputs, out);
}
}
} // namespace mlx::core
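The kernel above assumes 4-bit weights packed eight to a `uint32` (`pack_factor = 32 / bits`) with one scale and bias per group of 64 values. A short Python sketch of how a packed word is expanded and dequantized:

```python
def unpack_u4(word):
    # Eight 4-bit values per 32-bit word, least-significant nibble first.
    return [(word >> (4 * i)) & 0xF for i in range(8)]

def dequantize_group(words, scale, bias):
    # One group of 64 weights is 16 packed words; each value maps to w * scale + bias.
    return [v * scale + bias for w in words for v in unpack_u4(w)]
```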

View File

@@ -0,0 +1,139 @@
// Copyright © 2023 Apple Inc.
#include <cassert>
#include <Accelerate/Accelerate.h>
#include <simd/vector.h>
#include "mlx/backend/common/reduce.h"
#include "mlx/primitives.h"
namespace mlx::core {
namespace {
template <typename T, typename VT>
struct MinReduction {
T operator()(const T& a, const T& b) {
return std::min(a, b);
}
VT operator()(VT a, VT b) {
return simd_min(a, b);
}
};
template <typename T, typename VT>
struct MaxReduction {
T operator()(const T& a, const T& b) {
return std::max(a, b);
}
VT operator()(VT a, VT b) {
return simd_max(a, b);
}
};
template <typename T, typename VT>
struct SumReduction {
T operator()(const T& a, const T& b) {
return a + b;
}
VT operator()(VT a, VT b) {
return a + b;
}
};
template <typename T, typename VT, int N, typename Reduction>
struct StridedReduce {
void operator()(const T* x, T* accum, int size, size_t stride) {
Reduction op;
for (int i = 0; i < size; i++) {
size_t s = stride;
T* a = accum;
while (s >= N) {
*(VT*)a = op((*(VT*)x), (*(VT*)a));
x += N;
a += N;
s -= N;
}
while (s-- > 0) {
*a = op(*a, *x);
a++;
x++;
}
}
}
};
} // namespace
void Reduce::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == float32) {
if (reduce_type_ == Reduce::Sum) {
reduction_op<float, float>(
in,
out,
axes_,
0,
StridedReduce<
float,
simd_float16,
16,
SumReduction<float, simd_float16>>(),
[](const auto* x, auto* accum, int size) {
float acc;
vDSP_sve((const float*)x, 1, &acc, size);
(*accum) += acc;
},
[](auto* accum, auto x) { *accum += x; });
return;
} else if (reduce_type_ == Reduce::Max) {
reduction_op<float, float>(
in,
out,
axes_,
-std::numeric_limits<float>::infinity(),
StridedReduce<
float,
simd_float16,
16,
MaxReduction<float, simd_float16>>(),
[](const auto* x, auto* accum, int size) {
float max;
vDSP_maxv((const float*)x, 1, &max, size);
(*accum) = (*accum < max) ? max : *accum;
},
[](auto* accum, auto x) { (*accum) = (*accum < x) ? x : *accum; });
return;
} else if (reduce_type_ == Reduce::Min) {
reduction_op<float, float>(
in,
out,
axes_,
std::numeric_limits<float>::infinity(),
StridedReduce<
float,
simd_float16,
16,
MinReduction<float, simd_float16>>(),
[](const auto* x, auto* accum, int size) {
float min;
vDSP_minv((const float*)x, 1, &min, size);
(*accum) = (*accum > min) ? min : *accum;
},
[](auto* accum, auto x) { (*accum) = (*accum > x) ? x : *accum; });
return;
}
}
// TODO: Add integer addition and min/max using the templates above and
// simd_int16 and friends.
eval(inputs, out);
}
} // namespace mlx::core

View File

@@ -0,0 +1,393 @@
// Copyright © 2023-2024 Apple Inc.
#include <cassert>
#include <limits>
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_neon.h>
#endif
#include <simd/math.h>
#include <simd/vector.h>
#include "mlx/backend/common/copy.h"
#include "mlx/primitives.h"
namespace mlx::core {
namespace {
/**
* Compute exp(x) in an optimizer friendly way as follows:
*
* First change the problem to computing 2**y where y = x / ln(2).
*
* Now we will compute 2**y as 2**y1 * 2**y2 where y1 is the integer part
* `ipart` and y2 is fractional part. For the integer part we perform bit
* shifting and for the fractional part we use a polynomial approximation.
*
* The algorithm and the constants of the polynomial are taken from
* https://github.com/akohlmey/fastermath/blob/master/src/exp.c, which in turn
* took them from the Cephes math library.
*
* Note: The implementation below is a general fast exp. There could be faster
* implementations for numbers strictly < 0.
*/
inline simd_float16 simd_fast_exp(simd_float16 x_init) {
auto x = x_init * 1.442695; // multiply with log_2(e)
simd_float16 ipart, fpart;
simd_int16 epart;
x = simd_clamp(x, -80, 80);
ipart = simd::floor(x + 0.5);
fpart = x - ipart;
x = 1.535336188319500e-4f;
x = x * fpart + 1.339887440266574e-3f;
x = x * fpart + 9.618437357674640e-3f;
x = x * fpart + 5.550332471162809e-2f;
x = x * fpart + 2.402264791363012e-1f;
x = x * fpart + 6.931472028550421e-1f;
x = x * fpart + 1.000000000000000f;
// generate 2**ipart in the floating point representation using integer
// bitshifting
epart = (simd_int(ipart) + 127) << 23;
// Avoid suppressing NaNs
simd_int16 eq = (x_init == x_init);
return simd_bitselect(x_init, (*(simd_float16*)&epart) * x, eq);
}
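A scalar Python sketch of the same range-reduction trick: exp(x) = 2**ipart * 2**fpart, with the fractional part handled by the polynomial above (coefficients (ln 2)**k / k!) and the integer part built by writing the biased exponent directly into a float32 bit pattern.

```python
import math
import struct

def fast_exp(x):
    y = max(-80.0, min(80.0, x * 1.442695))  # x / ln(2), clamped like the SIMD code
    ipart = math.floor(y + 0.5)
    fpart = y - ipart
    # Polynomial approximation of 2**fpart.
    p = 1.535336188319500e-4
    for c in (1.339887440266574e-3, 9.618437357674640e-3, 5.550332471162809e-2,
              2.402264791363012e-1, 6.931472028550421e-1, 1.0):
        p = p * fpart + c
    # 2**ipart via the float32 bit layout: exponent field = ipart + 127.
    two_ip = struct.unpack('f', struct.pack('I', (int(ipart) + 127) << 23))[0]
    return p * two_ip

# fast_exp(1.0) ~= 2.71828
```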
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/**
* The ARM neon equivalent of the fast exp above.
*/
inline float16x8_t neon_fast_exp(float16x8_t x) {
x = vmulq_f16(x, vdupq_n_f16(float16_t(1.442695f))); // multiply with log_2(e)
x = vmaxq_f16(x, vdupq_n_f16(float16_t(-14.f))); // clamp under with -14
x = vminq_f16(x, vdupq_n_f16(float16_t(14.f))); // clamp over with 14
float16x8_t ipart = vrndmq_f16(vaddq_f16(x, vdupq_n_f16(float16_t(0.5f))));
float16x8_t fpart = vsubq_f16(x, ipart);
x = vdupq_n_f16(float16_t(1.535336188319500e-4f));
x = vfmaq_f16(vdupq_n_f16(float16_t(1.339887440266574e-3f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(9.618437357674640e-3f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(5.550332471162809e-2f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(2.402264791363012e-1f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(6.931472028550421e-1f)), x, fpart);
x = vfmaq_f16(vdupq_n_f16(float16_t(1.000000000000000f)), x, fpart);
// generate 2**ipart in the floating point representation using integer
// bitshifting
int16x8_t epart = vcvtq_s16_f16(ipart);
epart = vaddq_s16(epart, vdupq_n_s16(15));
epart = vshlq_n_s16(epart, 10);
return vmulq_f16(vreinterpretq_f16_s16(epart), x);
}
/**
* Implementation of folding maximum for ARM neon. This should possibly be
* refactored out of softmax.cpp at some point.
*/
inline float16_t neon_reduce_max(float16x8_t x) {
float16x4_t y;
y = vpmax_f16(vget_low_f16(x), vget_high_f16(x));
y = vpmax_f16(y, y);
y = vpmax_f16(y, y);
return vget_lane_f16(y, 0);
}
/**
* Implementation of folding sum for ARM neon. This should possibly be
* refactored out of softmax.cpp at some point.
*/
inline float16_t neon_reduce_add(float16x8_t x) {
float16x4_t y;
float16x4_t zero = vdup_n_f16(0);
y = vpadd_f16(vget_low_f16(x), vget_high_f16(x));
y = vpadd_f16(y, zero);
y = vpadd_f16(y, zero);
return vget_lane_f16(y, 0);
}
template <typename T, typename VT>
struct NeonFp16SimdOps {
VT init(T a) {
return vdupq_n_f16(a);
}
VT load(const T* a) {
return vld1q_f16(a);
}
void store(T* dst, VT x) {
vst1q_f16(dst, x);
}
VT max(VT a, VT b) {
return vmaxq_f16(a, b);
}
VT exp(VT x) {
return neon_fast_exp(x);
}
VT add(VT a, VT b) {
return vaddq_f16(a, b);
}
VT sub(VT a, T b) {
return vsubq_f16(a, vdupq_n_f16(b));
}
VT mul(VT a, VT b) {
return vmulq_f16(a, b);
}
VT mul(VT a, T b) {
return vmulq_f16(a, vdupq_n_f16(b));
}
T reduce_max(VT x) {
return neon_reduce_max(x);
}
T reduce_add(VT x) {
return neon_reduce_add(x);
}
};
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <typename T, typename VT>
struct AccelerateSimdOps {
VT init(T a) {
return a;
}
VT load(const T* a) {
return *(VT*)a;
}
void store(T* dst, VT x) {
*(VT*)dst = x;
}
VT max(VT a, VT b) {
return simd_max(a, b);
}
VT exp(VT x) {
return simd_fast_exp(x);
}
VT add(VT a, VT b) {
return a + b;
}
VT sub(VT a, T b) {
return a - b;
}
VT mul(VT a, VT b) {
return a * b;
}
VT mul(VT a, T b) {
return a * b;
}
T reduce_max(VT x) {
return simd_reduce_max(x);
}
T reduce_add(VT x) {
return simd_reduce_add(x);
}
};
template <typename T, typename AccT, typename VT, typename Ops, int N>
void softmax(const array& in, array& out) {
Ops ops;
const T* in_ptr = in.data<T>();
T* out_ptr = out.data<T>();
int M = in.shape().back();
int L = in.data_size() / M;
const T* current_in_ptr;
T* current_out_ptr;
for (int i = 0; i < L; i++, in_ptr += M, out_ptr += M) {
// Find the maximum
current_in_ptr = in_ptr;
VT vmaximum = ops.init(-std::numeric_limits<float>::infinity());
size_t s = M;
while (s >= N) {
VT vals;
if constexpr (std::is_same<T, AccT>::value) {
vals = ops.load(current_in_ptr);
} else {
for (int i = 0; i < N; ++i) {
vals[i] = static_cast<AccT>(current_in_ptr[i]);
}
}
vmaximum = ops.max(vals, vmaximum);
current_in_ptr += N;
s -= N;
}
AccT maximum = ops.reduce_max(vmaximum);
while (s-- > 0) {
maximum = std::max(maximum, static_cast<AccT>(*current_in_ptr));
current_in_ptr++;
}
// Compute the normalizer and the exponentials
VT vnormalizer = ops.init(0.0);
current_out_ptr = out_ptr;
current_in_ptr = in_ptr;
s = M;
while (s >= N) {
VT vexp;
if constexpr (std::is_same<T, AccT>::value) {
vexp = ops.load(current_in_ptr);
} else {
for (int i = 0; i < N; ++i) {
vexp[i] = static_cast<AccT>(current_in_ptr[i]);
}
}
vexp = ops.exp(ops.sub(vexp, maximum));
if constexpr (std::is_same<T, AccT>::value) {
ops.store(current_out_ptr, vexp);
}
vnormalizer = ops.add(vnormalizer, vexp);
current_in_ptr += N;
current_out_ptr += N;
s -= N;
}
AccT normalizer = ops.reduce_add(vnormalizer);
while (s-- > 0) {
AccT _exp = std::exp(*current_in_ptr - maximum);
if (std::is_same<T, AccT>::value) {
*current_out_ptr = _exp;
}
normalizer += _exp;
current_in_ptr++;
current_out_ptr++;
}
normalizer = 1 / normalizer;
// Normalize
current_out_ptr = out_ptr;
current_in_ptr = in_ptr;
s = M;
while (s >= N) {
if constexpr (std::is_same<T, AccT>::value) {
ops.store(current_out_ptr, ops.mul(*(VT*)current_out_ptr, normalizer));
} else {
VT vexp;
for (int i = 0; i < N; ++i) {
vexp[i] = static_cast<AccT>(current_in_ptr[i]);
}
vexp = ops.mul(ops.exp(ops.sub(vexp, maximum)), normalizer);
for (int i = 0; i < N; ++i) {
current_out_ptr[i] = vexp[i];
}
current_in_ptr += N;
}
current_out_ptr += N;
s -= N;
}
while (s-- > 0) {
if constexpr (std::is_same<T, AccT>::value) {
*current_out_ptr *= normalizer;
} else {
AccT _exp = std::exp(*current_in_ptr - maximum);
*current_out_ptr = static_cast<T>(_exp * normalizer);
current_in_ptr++;
}
current_out_ptr++;
}
}
}
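// Illustration (not part of this diff): the same three-pass scheme as the
// template above, written as a plain scalar loop over one row of length M.
// Assumes <algorithm>, <cmath>, and <limits>; the name is illustrative.
//
//   void scalar_softmax_row(const float* x, float* y, int M) {
//     float maximum = -std::numeric_limits<float>::infinity();
//     for (int i = 0; i < M; ++i) {     // pass 1: row maximum (for stability)
//       maximum = std::max(maximum, x[i]);
//     }
//     float normalizer = 0.0f;
//     for (int i = 0; i < M; ++i) {     // pass 2: shifted exponentials + sum
//       y[i] = std::exp(x[i] - maximum);
//       normalizer += y[i];
//     }
//     normalizer = 1.0f / normalizer;
//     for (int i = 0; i < M; ++i) {     // pass 3: normalize
//       y[i] *= normalizer;
//     }
//   }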
} // namespace
void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
// Make sure that the last dimension is contiguous
auto check_input = [](array x) {
bool no_copy = x.strides()[x.ndim() - 1] == 1;
if (x.ndim() > 1) {
auto s = x.strides()[x.ndim() - 2];
no_copy &= (s == 0 || s == x.shape().back());
}
if (no_copy) {
return x;
} else {
array x_copy(x.shape(), x.dtype(), nullptr, {});
copy(x, x_copy, CopyType::General);
return x_copy;
}
};
array in = check_input(std::move(inputs[0]));
out.set_data(
allocator::malloc_or_wait(in.data_size() * in.itemsize()),
in.data_size(),
in.strides(),
in.flags());
switch (in.dtype()) {
case bool_:
case uint8:
case uint16:
case uint32:
case uint64:
case int8:
case int16:
case int32:
case int64:
throw std::invalid_argument(
"Softmax is defined only for floating point types");
break;
case float32:
softmax<
float,
float,
simd_float16,
AccelerateSimdOps<float, simd_float16>,
16>(in, out);
break;
case float16:
if (precise_) {
softmax<
float16_t,
float,
simd_float16,
AccelerateSimdOps<float, simd_float16>,
16>(in, out);
} else {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
softmax<
float16_t,
float16_t,
float16x8_t,
NeonFp16SimdOps<float16_t, float16x8_t>,
8>(in, out);
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
eval(inputs, out); // Redirect to common backend for consistency
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
}
break;
case bfloat16:
eval(inputs, out);
break;
case complex64:
eval(inputs, out);
break;
}
}
} // namespace mlx::core

View File

@@ -0,0 +1,28 @@
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <Accelerate/Accelerate.h>
#include "mlx/dtype.h"
namespace mlx::core {
BNNSDataType to_bnns_dtype(Dtype mlx_dtype) {
uint32_t size_bits = size_of(mlx_dtype) * 8;
switch (kindof(mlx_dtype)) {
case Dtype::Kind::b:
return BNNSDataTypeBoolean;
case Dtype::Kind::u:
return BNNSDataType(BNNSDataTypeUIntBit | size_bits);
case Dtype::Kind::i:
return BNNSDataType(BNNSDataTypeIntBit | size_bits);
case Dtype::Kind::f:
return BNNSDataType(BNNSDataTypeFloatBit | size_bits);
case Dtype::Kind::V:
return BNNSDataTypeBFloat16;
case Dtype::Kind::c:
throw std::invalid_argument("BNNS does not support complex types");
}
}
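// For example (illustration, not part of this diff): to_bnns_dtype(float16)
// yields BNNSDataType(BNNSDataTypeFloatBit | 16), i.e. BNNSDataTypeFloat16,
// and to_bnns_dtype(int32) yields BNNSDataType(BNNSDataTypeIntBit | 32).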
} // namespace mlx::core

View File

@@ -1,8 +1,62 @@
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(COMPILER ${CMAKE_C_COMPILER})
set(CLANG TRUE)
else()
set(COMPILER ${CMAKE_CXX_COMPILER})
endif()
add_custom_command(
OUTPUT compiled_preamble.cpp
COMMAND
/bin/bash ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp ${COMPILER}
${PROJECT_SOURCE_DIR} ${CLANG}
DEPENDS make_compiled_preamble.sh
compiled_preamble.h
${PROJECT_SOURCE_DIR}/mlx/types/half_types.h
${PROJECT_SOURCE_DIR}/mlx/types/fp16.h
${PROJECT_SOURCE_DIR}/mlx/types/bf16.h
${PROJECT_SOURCE_DIR}/mlx/types/complex.h
ops.h)
add_custom_target(cpu_compiled_preamble DEPENDS compiled_preamble.cpp)
add_dependencies(mlx cpu_compiled_preamble)
target_sources(
mlx
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
${CMAKE_CURRENT_SOURCE_DIR}/erf.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduce_utils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)
${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp)
if(IOS)
target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled_nocpu.cpp)
else()
target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled_cpu.cpp)
endif()

View File

@@ -2,8 +2,8 @@
#include <cassert>
#include "mlx/backend/common/utils.h"
#include "mlx/primitives.h"
#include "utils.h"
namespace mlx::core {
@@ -61,7 +61,7 @@ void arg_reduce_dispatch(
} // namespace
void ArgReduce::eval_cpu(const std::vector<array>& inputs, array& out) {
void ArgReduce::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
out.set_data(allocator::malloc_or_wait(out.nbytes()));

View File

@@ -5,9 +5,9 @@
#include <sstream>
#include "mlx/allocator.h"
#include "mlx/backend/cpu/binary.h"
#include "mlx/backend/cpu/binary_ops.h"
#include "mlx/backend/cpu/binary_two.h"
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/binary_two.h"
#include "mlx/backend/common/ops.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
@@ -15,61 +15,69 @@ namespace mlx::core {
namespace {
template <typename T, typename U, typename Op>
void comparison_op(const array& a, const array& b, array& out, Op op) {
DefaultScalarVector<T, U, Op> opsv(op);
DefaultVectorScalar<T, U, Op> opvs(op);
DefaultVectorVector<T, U, Op> opvv(op);
binary_op<T, U>(a, b, out, op, opsv, opvs, opvv);
}
template <typename Op>
void comparison_op(const array& a, const array& b, array& out, Op op) {
switch (a.dtype()) {
case bool_:
binary_op<bool, bool>(a, b, out, op);
comparison_op<bool, bool>(a, b, out, op);
break;
case uint8:
binary_op<uint8_t, bool>(a, b, out, op);
comparison_op<uint8_t, bool>(a, b, out, op);
break;
case uint16:
binary_op<uint16_t, bool>(a, b, out, op);
comparison_op<uint16_t, bool>(a, b, out, op);
break;
case uint32:
binary_op<uint32_t, bool>(a, b, out, op);
comparison_op<uint32_t, bool>(a, b, out, op);
break;
case uint64:
binary_op<uint64_t, bool>(a, b, out, op);
comparison_op<uint64_t, bool>(a, b, out, op);
break;
case int8:
binary_op<int8_t, bool>(a, b, out, op);
comparison_op<int8_t, bool>(a, b, out, op);
break;
case int16:
binary_op<int16_t, bool>(a, b, out, op);
comparison_op<int16_t, bool>(a, b, out, op);
break;
case int32:
binary_op<int32_t, bool>(a, b, out, op);
comparison_op<int32_t, bool>(a, b, out, op);
break;
case int64:
binary_op<int64_t, bool>(a, b, out, op);
comparison_op<int64_t, bool>(a, b, out, op);
break;
case float16:
binary_op<float16_t, bool>(a, b, out, op);
comparison_op<float16_t, bool>(a, b, out, op);
break;
case float32:
binary_op<float, bool>(a, b, out, op);
comparison_op<float, bool>(a, b, out, op);
break;
case bfloat16:
binary_op<bfloat16_t, bool>(a, b, out, op);
comparison_op<bfloat16_t, bool>(a, b, out, op);
break;
case complex64:
binary_op<complex64_t, bool>(a, b, out, op);
comparison_op<complex64_t, bool>(a, b, out, op);
break;
}
}
} // namespace
void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
void Add::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, detail::Add());
}
void DivMod::eval_cpu(
void DivMod::eval(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
assert(inputs.size() == 2);
@@ -124,68 +132,50 @@ void DivMod::eval_cpu(
}
}
void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
void Divide::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, detail::Divide());
}
void Remainder::eval_cpu(const std::vector<array>& inputs, array& out) {
void Remainder::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, detail::Remainder());
}
void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
void Equal::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
if (equal_nan_) {
switch (a.dtype()) {
case float16:
binary_op<float16_t, bool>(a, b, out, detail::NaNEqual());
break;
case float32:
binary_op<float, bool>(a, b, out, detail::NaNEqual());
break;
case bfloat16:
binary_op<bfloat16_t, bool>(a, b, out, detail::NaNEqual());
break;
case complex64:
binary_op<complex64_t, bool>(a, b, out, detail::NaNEqual());
break;
default:
throw std::runtime_error(
"[NanEqual::eval_cpu] Only for floating point types.");
}
comparison_op(inputs[0], inputs[1], out, detail::NaNEqual());
} else {
comparison_op(a, b, out, detail::Equal());
comparison_op(inputs[0], inputs[1], out, detail::Equal());
}
}
void Greater::eval_cpu(const std::vector<array>& inputs, array& out) {
void Greater::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
comparison_op(inputs[0], inputs[1], out, detail::Greater());
}
void GreaterEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
void GreaterEqual::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
comparison_op(inputs[0], inputs[1], out, detail::GreaterEqual());
}
void Less::eval_cpu(const std::vector<array>& inputs, array& out) {
void Less::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
comparison_op(inputs[0], inputs[1], out, detail::Less());
}
void LessEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
void LessEqual::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
comparison_op(inputs[0], inputs[1], out, detail::LessEqual());
}
void LogAddExp::eval_cpu(const std::vector<array>& inputs, array& out) {
void LogAddExp::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
@@ -206,54 +196,54 @@ void LogAddExp::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void LogicalAnd::eval_cpu(const std::vector<array>& inputs, array& out) {
void LogicalAnd::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalAnd requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalAnd());
}
void LogicalOr::eval_cpu(const std::vector<array>& inputs, array& out) {
void LogicalOr::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2); // LogicalOr requires two input arrays
auto& in1 = inputs[0];
auto& in2 = inputs[1];
binary(in1, in2, out, detail::LogicalOr());
}
void Maximum::eval_cpu(const std::vector<array>& inputs, array& out) {
void Maximum::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, detail::Maximum());
}
void Minimum::eval_cpu(const std::vector<array>& inputs, array& out) {
void Minimum::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, detail::Minimum());
}
void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
void Multiply::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, detail::Multiply());
}
void NotEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
void NotEqual::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
comparison_op(inputs[0], inputs[1], out, detail::NotEqual());
}
void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
void Power::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
binary(a, b, out, detail::Power());
}
void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
void Subtract::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
auto& a = inputs[0];
auto& b = inputs[1];
@@ -317,7 +307,7 @@ void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
void ArcTan2::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
const auto& a = inputs[0];
const auto& b = inputs[1];

View File

@@ -1,6 +1,7 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include <cassert>
#include "mlx/allocator.h"
#include "mlx/array.h"
@@ -8,6 +9,8 @@
namespace mlx::core {
namespace {
enum class BinaryOpType {
ScalarScalar,
ScalarVector,
@@ -16,7 +19,7 @@ enum class BinaryOpType {
General,
};
inline BinaryOpType get_binary_op_type(const array& a, const array& b) {
BinaryOpType get_binary_op_type(const array& a, const array& b) {
BinaryOpType bopt;
if (a.data_size() == 1 && b.data_size() == 1) {
bopt = BinaryOpType::ScalarScalar;
@@ -25,8 +28,8 @@ inline BinaryOpType get_binary_op_type(const array& a, const array& b) {
} else if (b.data_size() == 1 && a.flags().contiguous) {
bopt = BinaryOpType::VectorScalar;
} else if (
(a.flags().row_contiguous && b.flags().row_contiguous) ||
(a.flags().col_contiguous && b.flags().col_contiguous)) {
a.flags().row_contiguous && b.flags().row_contiguous ||
a.flags().col_contiguous && b.flags().col_contiguous) {
bopt = BinaryOpType::VectorVector;
} else {
bopt = BinaryOpType::General;
@@ -34,7 +37,7 @@ inline BinaryOpType get_binary_op_type(const array& a, const array& b) {
return bopt;
}
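// Illustration (not part of this diff) of the classification above:
//   a: shape {1} (a.data_size() == 1), b: shape {8}, contiguous
//     -> BinaryOpType::ScalarVector
//   a, b: both shape {4, 5}, both row contiguous
//     -> BinaryOpType::VectorVector
//   a: shape {4, 5} row contiguous, b: shape {4, 5} transposed (col contiguous) view
//     -> BinaryOpType::General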
inline void set_binary_op_output_data(
void set_binary_op_output_data(
const array& a,
const array& b,
array& out,
@@ -119,4 +122,409 @@ inline void set_binary_op_output_data(
}
}
struct UseDefaultBinaryOp {};
template <typename T, typename U, typename Op>
struct DefaultVectorScalar {
Op op;
DefaultVectorScalar(Op op_) : op(op_) {}
void operator()(const T* a, const T* b, U* dst, int size) {
T scalar = *b;
while (size-- > 0) {
*dst = op(*a, scalar);
dst++;
a++;
}
}
};
template <typename T, typename U, typename Op>
struct DefaultScalarVector {
Op op;
DefaultScalarVector(Op op_) : op(op_) {}
void operator()(const T* a, const T* b, U* dst, int size) {
T scalar = *a;
while (size-- > 0) {
*dst = op(scalar, *b);
dst++;
b++;
}
}
};
template <typename T, typename U, typename Op>
struct DefaultVectorVector {
Op op;
DefaultVectorVector(Op op_) : op(op_) {}
void operator()(const T* a, const T* b, U* dst, int size) {
while (size-- > 0) {
*dst = op(*a, *b);
dst++;
a++;
b++;
}
}
};
template <typename T, typename U, typename Op, int D, bool Strided>
void binary_op_dims(
const T* a,
const T* b,
U* out,
Op op,
const Shape& shape,
const Strides& a_strides,
const Strides& b_strides,
const Strides& out_strides,
int axis) {
auto stride_a = a_strides[axis];
auto stride_b = b_strides[axis];
auto stride_out = out_strides[axis];
auto N = shape[axis];
for (int i = 0; i < N; i++) {
if constexpr (D > 1) {
binary_op_dims<T, U, Op, D - 1, Strided>(
a, b, out, op, shape, a_strides, b_strides, out_strides, axis + 1);
} else {
if constexpr (Strided) {
op(a, b, out, stride_out);
} else {
*out = op(*a, *b);
}
}
out += stride_out;
a += stride_a;
b += stride_b;
}
}
template <typename T, typename U, bool Strided, typename Op>
void binary_op_dispatch_dims(
const array& a,
const array& b,
array& out,
Op op,
int dim,
const Shape& shape,
const Strides& a_strides,
const Strides& b_strides,
const Strides& out_strides) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* out_ptr = out.data<U>();
switch (dim) {
case 1:
binary_op_dims<T, U, Op, 1, Strided>(
a_ptr,
b_ptr,
out_ptr,
op,
shape,
a_strides,
b_strides,
out_strides,
0);
return;
case 2:
binary_op_dims<T, U, Op, 2, Strided>(
a_ptr,
b_ptr,
out_ptr,
op,
shape,
a_strides,
b_strides,
out_strides,
0);
return;
case 3:
binary_op_dims<T, U, Op, 3, Strided>(
a_ptr,
b_ptr,
out_ptr,
op,
shape,
a_strides,
b_strides,
out_strides,
0);
return;
}
ContiguousIterator a_it(shape, a_strides, dim - 3);
ContiguousIterator b_it(shape, b_strides, dim - 3);
auto stride = out_strides[dim - 4];
for (int64_t elem = 0; elem < a.size(); elem += stride) {
binary_op_dims<T, U, Op, 3, Strided>(
a_ptr + a_it.loc,
b_ptr + b_it.loc,
out_ptr + elem,
op,
shape,
a_strides,
b_strides,
out_strides,
dim - 3);
a_it.step();
b_it.step();
}
}
template <
typename T,
typename U,
typename Op,
typename OpSV,
typename OpVS,
typename OpVV>
void binary_op(
const array& a,
const array& b,
array& out,
Op op,
OpSV opsv,
OpVS opvs,
OpVV opvv) {
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out, bopt);
// The full computation is scalar scalar so call the base op once
if (bopt == BinaryOpType::ScalarScalar) {
*(out.data<U>()) = op(*a.data<T>(), *b.data<T>());
return;
}
// The full computation is scalar vector so delegate to the op
if (bopt == BinaryOpType::ScalarVector) {
opsv(a.data<T>(), b.data<T>(), out.data<U>(), b.data_size());
return;
}
// The full computation is vector scalar so delegate to the op
if (bopt == BinaryOpType::VectorScalar) {
opvs(a.data<T>(), b.data<T>(), out.data<U>(), a.data_size());
return;
}
// The full computation is vector vector so delegate to the op
if (bopt == BinaryOpType::VectorVector) {
opvv(a.data<T>(), b.data<T>(), out.data<U>(), out.size());
return;
}
// General computation so let's try to optimize
auto [new_shape, new_strides] = collapse_contiguous_dims(
a.shape(), {a.strides(), b.strides(), out.strides()});
const auto& a_strides = new_strides[0];
const auto& b_strides = new_strides[1];
const auto& strides = new_strides[2];
// Get the left-most dim such that the array is row contiguous from there on
auto leftmost_rc_dim = [&strides](const auto& arr_strides) {
int d = arr_strides.size() - 1;
for (; d >= 0 && arr_strides[d] == strides[d]; d--) {
}
return d + 1;
};
auto a_rc_dim = leftmost_rc_dim(a_strides);
auto b_rc_dim = leftmost_rc_dim(b_strides);
// Get the left-most dim such that the array is a broadcasted "scalar" from there on
auto leftmost_s_dim = [](const auto& arr_strides) {
int d = arr_strides.size() - 1;
for (; d >= 0 && arr_strides[d] == 0; d--) {
}
return d + 1;
};
auto a_s_dim = leftmost_s_dim(a_strides);
auto b_s_dim = leftmost_s_dim(b_strides);
auto ndim = new_shape.size();
// Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
int dim = ndim;
if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
bopt = BinaryOpType::VectorVector;
dim = d;
// Case 2: LxM and Fx1 where L and F are broadcastable and M is row
// contiguous
} else if (int d = std::max(a_rc_dim, b_s_dim); d < ndim) {
bopt = BinaryOpType::VectorScalar;
dim = d;
// Case 3: Lx1 and FxM where L and F are broadcastable and M is row
// contiguous
} else if (int d = std::max(a_s_dim, b_rc_dim); d < ndim) {
bopt = BinaryOpType::ScalarVector;
dim = d;
}
// We can be sure dim > 0 since otherwise we would have used one of the fully
// contiguous methods above, except when the flags do not correspond to the
// underlying contiguity.
if (dim == 0 || strides[dim - 1] < 16) {
bopt = BinaryOpType::General;
dim = ndim;
}
switch (bopt) {
case BinaryOpType::VectorVector:
binary_op_dispatch_dims<T, U, true>(
a, b, out, opvv, dim, new_shape, a_strides, b_strides, strides);
break;
case BinaryOpType::VectorScalar:
binary_op_dispatch_dims<T, U, true>(
a, b, out, opvs, dim, new_shape, a_strides, b_strides, strides);
break;
case BinaryOpType::ScalarVector:
binary_op_dispatch_dims<T, U, true>(
a, b, out, opsv, dim, new_shape, a_strides, b_strides, strides);
break;
default:
binary_op_dispatch_dims<T, U, false>(
a, b, out, op, dim, new_shape, a_strides, b_strides, strides);
break;
}
}
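// Worked example (illustration, not part of this diff): take a row-contiguous
// a of shape {2, 3, 32} and b broadcast from shape {32} (its strides are 0
// except for a 1 in the last dim). Case 1 applies: the dispatch picks
// VectorVector with the last dim kept vectorized, so opvv is called on
// contiguous runs of 32 output elements while the outer loop walks the
// broadcast dims; a run length of 32 also passes the `strides[dim - 1] < 16`
// fallback check.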
template <typename T, typename Op, typename OpSV, typename OpVS, typename OpVV>
void binary_op(
const array& a,
const array& b,
array& out,
Op op,
OpSV opsv,
OpVS opvs,
OpVV opvv) {
// TODO: The following mess of constexpr evaluations can probably be achieved
// with template specializations and overloading. Would it be simpler?
if constexpr (std::is_same<decltype(opsv), UseDefaultBinaryOp>::value) {
if constexpr (std::is_same<decltype(opvs), UseDefaultBinaryOp>::value) {
if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// All ops are UseDefaultBinaryOp (why oh why would someone call that?)
binary_op<T, T>(
a,
b,
out,
op,
DefaultScalarVector<T, T, Op>(op),
DefaultVectorScalar<T, T, Op>(op),
DefaultVectorVector<T, T, Op>(op));
} else {
// opsv and opvs were UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
out,
op,
DefaultScalarVector<T, T, Op>(op),
DefaultVectorScalar<T, T, Op>(op),
opvv);
}
} else if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::
value) {
// opsv and opvv were UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
out,
op,
DefaultScalarVector<T, T, Op>(op),
opvs,
DefaultVectorVector<T, T, Op>(op));
} else {
// opsv was UseDefaultBinaryOp
binary_op<T, T>(
a, b, out, op, DefaultScalarVector<T, T, Op>(op), opvs, opvv);
}
} else if constexpr (std::is_same<decltype(opvs), UseDefaultBinaryOp>::
value) {
if (std::is_same<decltype(opvv), UseDefaultBinaryOp>::value) {
// opvs and opvv were UseDefaultBinaryOp
binary_op<T, T>(
a,
b,
out,
op,
opsv,
DefaultVectorScalar<T, T, Op>(op),
DefaultVectorVector<T, T, Op>(op));
} else {
// opvs was UseDefaultBinaryOp
binary_op<T, T>(
a, b, out, op, opsv, DefaultVectorScalar<T, T, Op>(op), opvv);
}
} else if constexpr (std::is_same<decltype(opvv), UseDefaultBinaryOp>::
value) {
// opvv was UseDefaultBinaryOp
binary_op<T, T>(
a, b, out, op, opsv, opvs, DefaultVectorVector<T, T, Op>(op));
} else {
// All ops provided
binary_op<T, T>(a, b, out, op, opsv, opvs, opvv);
}
}
template <typename T, typename Op>
void binary_op(const array& a, const array& b, array& out, Op op) {
DefaultScalarVector<T, T, Op> opsv(op);
DefaultVectorScalar<T, T, Op> opvs(op);
DefaultVectorVector<T, T, Op> opvv(op);
binary_op<T, T>(a, b, out, op, opsv, opvs, opvv);
}
template <typename... Ops>
void binary(const array& a, const array& b, array& out, Ops... ops) {
switch (out.dtype()) {
case bool_:
binary_op<bool>(a, b, out, ops...);
break;
case uint8:
binary_op<uint8_t>(a, b, out, ops...);
break;
case uint16:
binary_op<uint16_t>(a, b, out, ops...);
break;
case uint32:
binary_op<uint32_t>(a, b, out, ops...);
break;
case uint64:
binary_op<uint64_t>(a, b, out, ops...);
break;
case int8:
binary_op<int8_t>(a, b, out, ops...);
break;
case int16:
binary_op<int16_t>(a, b, out, ops...);
break;
case int32:
binary_op<int32_t>(a, b, out, ops...);
break;
case int64:
binary_op<int64_t>(a, b, out, ops...);
break;
case float16:
binary_op<float16_t>(a, b, out, ops...);
break;
case float32:
binary_op<float>(a, b, out, ops...);
break;
case bfloat16:
binary_op<bfloat16_t>(a, b, out, ops...);
break;
case complex64:
binary_op<complex64_t>(a, b, out, ops...);
break;
}
}
} // namespace
} // namespace mlx::core

View File

@@ -2,8 +2,8 @@
#pragma once
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/binary.h"
namespace mlx::core {

View File

@@ -1,8 +1,8 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h"
@@ -64,7 +64,7 @@ void cholesky_impl(const array& a, array& factor, bool upper) {
}
}
void Cholesky::eval_cpu(const std::vector<array>& inputs, array& output) {
void Cholesky::eval(const std::vector<array>& inputs, array& output) {
if (inputs[0].dtype() != float32) {
throw std::runtime_error("[Cholesky::eval] only supports float32.");
}

View File

@@ -42,7 +42,9 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {
return move_or_copy(in, out, strides_, flags, data_size, offset_);
}
void broadcast(const array& in, array& out) {
void Broadcast::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.size() == 0) {
out.set_data(nullptr);
return;
@@ -59,14 +61,6 @@ void broadcast(const array& in, array& out) {
move_or_copy(in, out, strides, flags, in.data_size());
}
void Broadcast::eval(const std::vector<array>& inputs, array& out) {
broadcast(inputs[0], out);
}
void BroadcastAxes::eval(const std::vector<array>& inputs, array& out) {
broadcast(inputs[0], out);
}
void Copy::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
move_or_copy(inputs[0], out);
@@ -91,16 +85,6 @@ void Depends::eval(
}
}
void ExpandDims::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
auto strides = in.strides();
for (auto ax : axes_) {
strides.insert(strides.begin() + ax, 1);
}
move_or_copy(in, out, strides, in.flags(), in.data_size());
}
void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
@@ -157,7 +141,9 @@ void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
}
}
std::pair<bool, Strides> prepare_reshape(const array& in, const array& out) {
std::pair<bool, Strides> Reshape::prepare_reshape(
const array& in,
const array& out) {
// Special case for empty arrays or row contiguous arrays
if (in.size() == 0 || in.flags().row_contiguous) {
return {false, out.strides()};
@@ -194,7 +180,7 @@ std::pair<bool, Strides> prepare_reshape(const array& in, const array& out) {
return {copy_necessary, out_strides};
}
void shared_buffer_reshape(
void Reshape::shared_buffer_reshape(
const array& in,
const Strides& out_strides,
array& out) {
@@ -262,20 +248,6 @@ void Split::eval(
}
}
void Squeeze::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
Strides strides;
for (int i = 0, j = 0; i < in.ndim(); ++i) {
if (j < axes_.size() && i == axes_[j]) {
j++;
} else {
strides.push_back(in.strides(i));
}
}
move_or_copy(in, out, strides, in.flags(), in.data_size());
}
void StopGradient::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
move_or_copy(inputs[0], out);

View File

@@ -130,7 +130,7 @@ std::string build_lib_name(
bool compiled_check_contiguity(
const std::vector<array>& inputs,
const Shape& shape) {
const std::vector<int>& shape) {
bool contiguous = true;
bool all_contig = true;
bool all_row_contig = true;

View File

@@ -11,7 +11,9 @@
namespace mlx::core {
inline bool is_static_cast(const Primitive& p) {
return (typeid(p) == typeid(Broadcast) || typeid(p) == typeid(AsType));
return (
typeid(p) == typeid(Broadcast) || typeid(p) == typeid(Copy) ||
typeid(p) == typeid(StopGradient) || typeid(p) == typeid(AsType));
}
std::string build_lib_name(
@@ -54,7 +56,7 @@ inline bool is_scalar(const array& x) {
// Check if we can use a contiguous operation given inputs and the output shape
bool compiled_check_contiguity(
const std::vector<array>& inputs,
const Shape& shape);
const std::vector<int>& shape);
// Allocate space for the outputs possibly with input donation
void compiled_allocate_outputs(

View File

@@ -7,11 +7,8 @@
#include <mutex>
#include <shared_mutex>
#include <fmt/format.h>
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/cpu/compiled_preamble.h"
#include "mlx/backend/cpu/jit_compiler.h"
#include "mlx/backend/common/compiled_preamble.h"
#include "mlx/device.h"
#include "mlx/graph_utils.h"
@@ -47,9 +44,12 @@ namespace detail {
bool compile_available_for_device(const Device& device) {
return true;
}
} // namespace detail
std::string get_temp_file(const std::string& name) {
return std::filesystem::temp_directory_path().append(name).string();
}
// Return a pointer to a compiled function
void* compile(
const std::string& kernel_name,
@@ -68,30 +68,24 @@ void* compile(
std::string source_code = source_builder();
std::string kernel_file_name;
// Deal with long kernel names. The maximum length for a filename on macOS is
// 255 characters, and on Windows the maximum length for the whole path is 260.
// Clip the file name with a little extra room and append a 16 character hash.
#ifdef _WIN32
constexpr int max_file_name_length = 140;
#else
// Deal with long kernel names. Maximum length for files on macOS is 255
// characters. Clip file name with a little extra room and append a 16
// character hash.
constexpr int max_file_name_length = 245;
#endif
if (kernel_name.size() > max_file_name_length) {
std::ostringstream file_name;
file_name
<< std::string_view(kernel_name).substr(0, max_file_name_length - 16);
auto file_id =
std::hash<std::string>{}(kernel_name.substr(max_file_name_length - 16));
auto file_id = std::hash<std::string>{}(kernel_name);
file_name << "_" << std::hex << std::setw(16) << file_id << std::dec;
kernel_file_name = file_name.str();
} else {
kernel_file_name = kernel_name;
}
auto output_dir = std::filesystem::temp_directory_path();
std::string shared_lib_name = "lib" + kernel_file_name + ".so";
auto shared_lib_path = (output_dir / shared_lib_name).string();
std::ostringstream shared_lib_name;
shared_lib_name << "lib" << kernel_file_name << ".so";
auto shared_lib_path = get_temp_file(shared_lib_name.str());
bool lib_exists = false;
{
std::ifstream f(shared_lib_path.c_str());
@@ -100,21 +94,24 @@ void* compile(
if (!lib_exists) {
// Open source file and write source code to it
std::string source_file_name = kernel_file_name + ".cpp";
auto source_file_path = (output_dir / source_file_name).string();
std::ostringstream source_file_name;
source_file_name << kernel_file_name << ".cpp";
auto source_file_path = get_temp_file(source_file_name.str());
std::ofstream source_file(source_file_path);
source_file << source_code;
source_file.close();
try {
JitCompiler::exec(JitCompiler::build_command(
output_dir, source_file_name, shared_lib_name));
} catch (const std::exception& error) {
throw std::runtime_error(fmt::format(
"[Compile::eval_cpu] Failed to compile function {0}: {1}",
kernel_name,
error.what()));
std::ostringstream build_command;
build_command << "g++ -std=c++17 -O3 -Wall -fPIC -shared '"
<< source_file_path << "' -o '" << shared_lib_path << "'";
std::string build_command_str = build_command.str();
auto return_code = system(build_command_str.c_str());
if (return_code) {
std::ostringstream msg;
msg << "[Compile::eval_cpu] Failed to compile function " << kernel_name
<< " with error code " << return_code << "." << std::endl;
throw std::runtime_error(msg.str());
}
}
@@ -154,11 +151,6 @@ inline void build_kernel(
NodeNamer namer;
#ifdef _MSC_VER
// Export the symbol
os << "__declspec(dllexport) ";
#endif
// Start the kernel
os << "void " << kernel_name << "(void** args) {" << std::endl;

View File

@@ -1,7 +1,6 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/compile_impl.h"
#include "mlx/primitives.h"
#include "mlx/backend/common/compiled.h"
namespace mlx::core {

View File

@@ -5,8 +5,7 @@
// clang-format off
#include "mlx/types/half_types.h"
#include "mlx/types/complex.h"
#include "mlx/backend/cpu/unary_ops.h"
#include "mlx/backend/cpu/binary_ops.h"
#include "mlx/backend/common/ops.h"
// clang-format on
const char* get_kernel_preamble();

View File

@@ -3,8 +3,8 @@
#include <cassert>
#include <numeric>
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
@@ -726,7 +726,7 @@ void explicit_gemm_conv_1D_cpu(
auto conv_dtype = float32;
// Pad input
Shape padded_shape = {N, iH + 2 * padding[0], C};
std::vector<int> padded_shape = {N, iH + 2 * padding[0], C};
array in_padded(padded_shape, conv_dtype, nullptr, {});
// Fill with zeros
@@ -765,7 +765,7 @@ void explicit_gemm_conv_1D_cpu(
in_padded, strided_strides, flags, in_strided_view.size(), 0);
// Materialize strided view
Shape strided_reshape = {N * oH, wH * C};
std::vector<int> strided_reshape = {N * oH, wH * C};
array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
copy(in_strided_view, in_strided, CopyType::General);
@@ -843,7 +843,8 @@ void explicit_gemm_conv_2D_cpu(
auto conv_dtype = out.dtype();
// Pad input
Shape padded_shape = {N, iH + 2 * padding[0], iW + 2 * padding[1], C};
std::vector<int> padded_shape = {
N, iH + 2 * padding[0], iW + 2 * padding[1], C};
array in_padded(padded_shape, conv_dtype, nullptr, {});
// Fill with zeros
@@ -880,7 +881,7 @@ void explicit_gemm_conv_2D_cpu(
in_padded, strided_strides, flags, in_strided_view.size(), 0);
// Materialize strided view
Shape strided_reshape = {N * oH * oW, wH * wW * C};
std::vector<int> strided_reshape = {N * oH * oW, wH * wW * C};
array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
copy(in_strided_view, in_strided, CopyType::General);
@@ -933,19 +934,19 @@ void explicit_gemm_conv_ND_cpu(
const std::vector<int>& wt_dilation,
const bool flip) {
const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
const auto iDim =
Shape(in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
const auto oDim = Shape(
const auto iDim = std::vector<int>(
in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
const auto oDim = std::vector<int>(
out.shape().begin() + 1, out.shape().end() - 1); // Output spatial dim
const int O = wt.shape(0); // Out channels
const int C = wt.shape(-1); // In channels
const auto wDim =
Shape(wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim
const auto wDim = std::vector<int>(
wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim
auto conv_dtype = float32;
// Pad input
Shape padded_shape(in.shape().size());
std::vector<int> padded_shape(in.shape().size());
padded_shape.front() = N;
for (size_t i = 0; i < iDim.size(); i++) {
padded_shape[i + 1] = iDim[i] + 2 * padding[i];
@@ -1128,7 +1129,7 @@ void conv_3D_cpu(
} // namespace
void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
void Convolution::eval(const std::vector<array>& inputs, array& out) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto& in = inputs[0];

View File

@@ -3,9 +3,8 @@
#include <numeric>
#include "mlx/allocator.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/simd/simd.h"
namespace mlx::core {
@@ -24,7 +23,6 @@ template <typename SrcT, typename DstT>
void copy_vector(const array& src, array& dst) {
auto src_ptr = src.data<SrcT>();
auto dst_ptr = dst.data<DstT>();
size_t size = src.data_size();
std::copy(src_ptr, src_ptr + src.data_size(), dst_ptr);
}

View File

@@ -3,6 +3,7 @@
#pragma once
#include "mlx/array.h"
#include "mlx/backend/common/utils.h"
namespace mlx::core {
@@ -22,4 +23,17 @@ enum class CopyType {
GeneralGeneral
};
void copy(const array& src, array& dst, CopyType ctype);
void copy_inplace(const array& src, array& dst, CopyType ctype);
void copy_inplace(
const array& src,
array& dst,
const Shape& data_shape,
const Strides& i_strides,
const Strides& o_strides,
int64_t i_offset,
int64_t o_offset,
CopyType ctype);
} // namespace mlx::core

View File

@@ -0,0 +1,196 @@
// Copyright © 2023-2024 Apple Inc.
#include <cstring>
#include "mlx/array.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack.h"
#include "mlx/backend/common/utils.h"
#include "mlx/primitives.h"
#define DEFAULT(primitive) \
void primitive::eval_cpu(const std::vector<array>& inputs, array& out) { \
primitive::eval(inputs, out); \
}
#define DEFAULT_MULTI(primitive) \
void primitive::eval_cpu( \
const std::vector<array>& inputs, std::vector<array>& outputs) { \
primitive::eval(inputs, outputs); \
}
namespace mlx::core {
DEFAULT(Abs)
DEFAULT(Add)
DEFAULT(Arange)
DEFAULT(ArcCos)
DEFAULT(ArcCosh)
DEFAULT(ArcSin)
DEFAULT(ArcSinh)
DEFAULT(ArcTan)
DEFAULT(ArcTan2)
DEFAULT(ArcTanh)
DEFAULT(ArgPartition)
DEFAULT(ArgReduce)
DEFAULT(ArgSort)
DEFAULT(AsType)
DEFAULT(AsStrided)
DEFAULT(Broadcast)
DEFAULT(BlockMaskedMM)
DEFAULT(GatherMM)
DEFAULT(GatherQMM)
DEFAULT_MULTI(DivMod)
DEFAULT(Ceil)
DEFAULT(Concatenate)
DEFAULT(Conjugate)
DEFAULT(Convolution)
DEFAULT(Copy)
DEFAULT(Cos)
DEFAULT(Cosh)
DEFAULT_MULTI(CustomTransforms)
DEFAULT_MULTI(Depends)
DEFAULT(Divide)
DEFAULT(NumberOfElements)
DEFAULT(Remainder)
DEFAULT(Equal)
DEFAULT(Erf)
DEFAULT(ErfInv)
DEFAULT(Exp)
DEFAULT(Expm1)
DEFAULT(FFT)
DEFAULT(Floor)
DEFAULT(Full)
DEFAULT(Gather)
DEFAULT(Greater)
DEFAULT(GreaterEqual)
DEFAULT(Hadamard)
DEFAULT(Less)
DEFAULT(LessEqual)
DEFAULT(Load)
DEFAULT(Log)
DEFAULT(Log1p)
DEFAULT(LogicalNot)
DEFAULT(LogicalAnd)
DEFAULT(LogicalOr)
DEFAULT(LogAddExp)
DEFAULT(Maximum)
DEFAULT(Minimum)
DEFAULT(Multiply)
DEFAULT(Negative)
DEFAULT(NotEqual)
DEFAULT(Pad)
DEFAULT(Partition)
DEFAULT(Power)
DEFAULT_MULTI(QRF)
DEFAULT(QuantizedMatmul)
DEFAULT(RandomBits)
DEFAULT(Reduce)
DEFAULT(Reshape)
DEFAULT(Round)
DEFAULT(Scan)
DEFAULT(Scatter)
DEFAULT(Select)
DEFAULT(Sigmoid)
DEFAULT(Sign)
DEFAULT(Sin)
DEFAULT(Sinh)
DEFAULT(Slice)
DEFAULT(SliceUpdate)
DEFAULT(Softmax)
DEFAULT(Sort)
DEFAULT_MULTI(Split)
DEFAULT(Square)
DEFAULT(Sqrt)
DEFAULT(StopGradient)
DEFAULT(Subtract)
DEFAULT_MULTI(SVD)
DEFAULT(Tan)
DEFAULT(Tanh)
DEFAULT(Transpose)
DEFAULT(Inverse)
DEFAULT(Cholesky)
DEFAULT_MULTI(Eigh)
namespace {
inline void matmul_common_general(
const array& a_pre,
const array& b_pre,
array& out,
float alpha = 1.0f,
float beta = 0.0f) {
auto check_transpose = [](const array& arr) {
auto stx = arr.strides()[arr.ndim() - 2];
auto sty = arr.strides()[arr.ndim() - 1];
if (stx == arr.shape(-1) && sty == 1) {
return std::make_tuple(false, stx, arr);
} else if (stx == 1 && sty == arr.shape(-2)) {
return std::make_tuple(true, sty, arr);
} else {
array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
copy(arr, arr_copy, CopyType::General);
stx = arr.shape(-1);
return std::make_tuple(false, stx, arr_copy);
}
};
auto [a_transposed, lda, a] = check_transpose(a_pre);
auto [b_transposed, ldb, b] = check_transpose(b_pre);
size_t M = a.shape(-2);
size_t N = b.shape(-1);
size_t K = a.shape(-1);
if (M == 0 || N == 0) {
return;
}
if (K == 0) {
std::memset(static_cast<void*>(out.data<float>()), 0, out.nbytes());
return;
}
for (int i = 0; i < (a.size() / (M * K)); ++i) {
cblas_sgemm(
CblasRowMajor,
a_transposed ? CblasTrans : CblasNoTrans, // transA
b_transposed ? CblasTrans : CblasNoTrans, // transB
M,
N,
K,
alpha, // alpha
a.data<float>() + elem_to_loc(M * K * i, a.shape(), a.strides()),
lda,
b.data<float>() + elem_to_loc(K * N * i, b.shape(), b.strides()),
ldb,
beta, // beta
out.data<float>() + M * N * i,
out.shape(-1) // ldc
);
}
}
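// Illustration (not part of this diff): for a row-contiguous (M, K) input,
// check_transpose sees strides (K, 1) and returns {false, lda = K, arr}; for a
// transposed view with strides (1, M) it returns {true, lda = M, arr}, which
// is forwarded to cblas_sgemm as CblasTrans with the matching leading
// dimension.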
} // namespace
void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[Matmul::eval_cpu] Currently only supports float32.");
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
return matmul_common_general(inputs[0], inputs[1], out);
}
void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[AddMM::eval_cpu] Currently only supports float32.");
}
// Fill output with C
auto& c = inputs[2];
CopyType ctype = c.data_size() == 1 ? CopyType::Scalar : CopyType::General;
copy(c, out, ctype);
return matmul_common_general(inputs[0], inputs[1], out, alpha_, beta_);
}
} // namespace mlx::core

View File

@@ -2,8 +2,8 @@
#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h"
@@ -45,9 +45,7 @@ void ssyevd(
} // namespace
void Eigh::eval_cpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
void Eigh::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
const auto& a = inputs[0];
auto& values = outputs[0];

View File

@@ -0,0 +1,40 @@
// Copyright © 2023 Apple Inc.
#include <cmath>
namespace mlx::core {
/* Approximation to the inverse error function.
* Based on code from:
* https://stackoverflow.com/questions/27229371/inverse-error-function-in-c#answer-49743348
*/
float erfinv(float a) {
auto t = std::fma(a, 0.0f - a, 1.0f);
t = std::log(t);
float p;
if (std::abs(t) > 6.125f) { // maximum ulp error = 2.35793
p = 3.03697567e-10f; // 0x1.4deb44p-32
p = std::fma(p, t, 2.93243101e-8f); // 0x1.f7c9aep-26
p = std::fma(p, t, 1.22150334e-6f); // 0x1.47e512p-20
p = std::fma(p, t, 2.84108955e-5f); // 0x1.dca7dep-16
p = std::fma(p, t, 3.93552968e-4f); // 0x1.9cab92p-12
p = std::fma(p, t, 3.02698812e-3f); // 0x1.8cc0dep-9
p = std::fma(p, t, 4.83185798e-3f); // 0x1.3ca920p-8
p = std::fma(p, t, -2.64646143e-1f); // -0x1.0eff66p-2
p = std::fma(p, t, 8.40016484e-1f); // 0x1.ae16a4p-1
} else { // maximum ulp error = 2.35002
p = 5.43877832e-9f; // 0x1.75c000p-28
p = std::fma(p, t, 1.43285448e-7f); // 0x1.33b402p-23
p = std::fma(p, t, 1.22774793e-6f); // 0x1.499232p-20
p = std::fma(p, t, 1.12963626e-7f); // 0x1.e52cd2p-24
p = std::fma(p, t, -5.61530760e-5f); // -0x1.d70bd0p-15
p = std::fma(p, t, -1.47697632e-4f); // -0x1.35be90p-13
p = std::fma(p, t, 2.31468678e-3f); // 0x1.2f6400p-9
p = std::fma(p, t, 1.15392581e-2f); // 0x1.7a1e50p-7
p = std::fma(p, t, -2.32015476e-1f); // -0x1.db2aeep-3
p = std::fma(p, t, 8.86226892e-1f); // 0x1.c5bf88p-1
}
return a * p;
}
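// Illustration (not part of this diff): erfinv inverts erf on (-1, 1), so
// erfinv(0.0f) == 0.0f and erfinv(0.5f) is roughly 0.4769, since
// std::erf(0.4769f) is approximately 0.5.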
} // namespace mlx::core

View File

@@ -8,7 +8,7 @@
namespace mlx::core {
void FFT::eval_cpu(const std::vector<array>& inputs, array& out) {
void FFT::eval(const std::vector<array>& inputs, array& out) {
auto& in = inputs[0];
std::vector<std::ptrdiff_t> strides_in(
in.strides().begin(), in.strides().end());

View File

@@ -2,8 +2,8 @@
#include <cassert>
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/hadamard.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/primitives.h"
namespace mlx::core {
@@ -82,7 +82,7 @@ void hadamard(array& out, int n, int m, float scale) {
}
}
void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
void Hadamard::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
@@ -104,4 +104,4 @@ void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
} // namespace mlx::core
} // namespace mlx::core

View File

@@ -6,8 +6,8 @@
#include "mlx/allocator.h"
#include "mlx/primitives.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/copy.h"
namespace mlx::core {
@@ -16,6 +16,11 @@ inline size_t offset_neg_idx(IdxT idx, size_t size) {
return (idx < 0) ? idx + size : idx;
}
template <>
inline size_t offset_neg_idx(bool idx, size_t) {
return idx;
}
template <>
inline size_t offset_neg_idx(uint32_t idx, size_t) {
return idx;
@@ -157,18 +162,21 @@ void dispatch_gather(
}
}
void Gather::eval_cpu(const std::vector<array>& inputs, array& out) {
void Gather::eval(const std::vector<array>& inputs, array& out) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto& src = inputs[0];
std::vector<array> inds(inputs.begin() + 1, inputs.end());
if (inds.empty()) {
dispatch_gather<uint8_t>(src, inds, out, axes_, slice_sizes_);
dispatch_gather<bool>(src, inds, out, axes_, slice_sizes_);
return;
}
switch (inds[0].dtype()) {
case bool_:
dispatch_gather<bool>(src, inds, out, axes_, slice_sizes_);
break;
case uint8:
dispatch_gather<uint8_t>(src, inds, out, axes_, slice_sizes_);
break;
@@ -193,142 +201,12 @@ void Gather::eval_cpu(const std::vector<array>& inputs, array& out) {
case int64:
dispatch_gather<int64_t>(src, inds, out, axes_, slice_sizes_);
break;
default:
throw std::runtime_error(
"[Gather::eval_cpu] Cannot gather with indices type.");
break;
}
}
template <typename T, typename IdxT>
void gather_axis(
const array& src,
const array& ind,
array& out,
const int axis) {
auto strides = ind.strides();
strides.erase(strides.begin() + axis);
auto shape = ind.shape();
shape.erase(shape.begin() + axis);
ContiguousIterator ind_it(shape, strides, src.ndim() - 1);
strides = src.strides();
strides.erase(strides.begin() + axis);
ContiguousIterator src_it(shape, strides, src.ndim() - 1);
auto ind_ptr = ind.data<IdxT>();
auto src_ptr = src.data<T>();
auto dst_ptr = out.data<T>();
auto ind_ax_stride = ind.strides(axis);
auto src_ax_stride = src.strides(axis);
auto dst_ax_stride = out.strides(axis);
auto ind_ax_size = ind.shape(axis);
auto src_ax_size = src.shape(axis);
size_t size_pre = 1;
size_t size_post = 1;
for (int i = 0; i < axis; ++i) {
size_pre *= ind.shape(i);
}
for (int i = axis + 1; i < ind.ndim(); ++i) {
size_post *= ind.shape(i);
}
size_t stride_pre = size_post * ind_ax_size;
for (size_t i = 0; i < size_pre; i++) {
for (size_t k = 0; k < size_post; k++) {
for (int j = 0; j < ind_ax_size; ++j) {
auto ind_val = offset_neg_idx(
ind_ptr[ind_it.loc + j * ind_ax_stride], src_ax_size);
dst_ptr[k + j * dst_ax_stride] =
src_ptr[src_it.loc + ind_val * src_ax_stride];
}
ind_it.step();
src_it.step();
}
dst_ptr += stride_pre;
}
}
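// Illustration (not part of this diff): along axis 1 of a 3-d array this
// computes out[i, j, k] = src[i, ind[i, j, k], k], with negative indices
// wrapped by offset_neg_idx; i.e. the same semantics as numpy's
// take_along_axis.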
template <typename IdxT>
void dispatch_gather_axis(
const array& src,
const array& inds,
array& out,
const int axis) {
switch (out.dtype()) {
case bool_:
gather_axis<bool, IdxT>(src, inds, out, axis);
break;
case uint8:
gather_axis<uint8_t, IdxT>(src, inds, out, axis);
break;
case uint16:
gather_axis<uint16_t, IdxT>(src, inds, out, axis);
break;
case uint32:
gather_axis<uint32_t, IdxT>(src, inds, out, axis);
break;
case uint64:
gather_axis<uint64_t, IdxT>(src, inds, out, axis);
break;
case int8:
gather_axis<int8_t, IdxT>(src, inds, out, axis);
break;
case int16:
gather_axis<int16_t, IdxT>(src, inds, out, axis);
break;
case int32:
gather_axis<int32_t, IdxT>(src, inds, out, axis);
break;
case int64:
gather_axis<int64_t, IdxT>(src, inds, out, axis);
break;
case float16:
gather_axis<float16_t, IdxT>(src, inds, out, axis);
break;
case float32:
gather_axis<float, IdxT>(src, inds, out, axis);
break;
case bfloat16:
gather_axis<bfloat16_t, IdxT>(src, inds, out, axis);
break;
case complex64:
gather_axis<complex64_t, IdxT>(src, inds, out, axis);
break;
}
}
void GatherAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto& src = inputs[0];
auto& inds = inputs[1];
switch (inds.dtype()) {
case uint8:
dispatch_gather_axis<uint8_t>(src, inds, out, axis_);
break;
case uint16:
dispatch_gather_axis<uint16_t>(src, inds, out, axis_);
break;
case uint32:
dispatch_gather_axis<uint32_t>(src, inds, out, axis_);
break;
case uint64:
dispatch_gather_axis<uint64_t>(src, inds, out, axis_);
break;
case int8:
dispatch_gather_axis<int8_t>(src, inds, out, axis_);
break;
case int16:
dispatch_gather_axis<int16_t>(src, inds, out, axis_);
break;
case int32:
dispatch_gather_axis<int32_t>(src, inds, out, axis_);
break;
case int64:
dispatch_gather_axis<int64_t>(src, inds, out, axis_);
break;
default:
throw std::runtime_error(
"[GatherAxis::eval_cpu] Cannot gather with indices type.");
"[Gather::eval] Cannot gather with floating point indices.");
break;
}
}
@@ -418,11 +296,14 @@ void dispatch_scatter(
const std::vector<int>& axes,
Scatter::ReduceType rtype) {
if (inds.empty()) {
dispatch_scatter_inds<InT, uint8_t>(out, inds, updates, axes, rtype);
dispatch_scatter_inds<InT, bool>(out, inds, updates, axes, rtype);
return;
}
switch (inds[0].dtype()) {
case bool_:
dispatch_scatter_inds<InT, bool>(out, inds, updates, axes, rtype);
break;
case uint8:
dispatch_scatter_inds<InT, uint8_t>(out, inds, updates, axes, rtype);
break;
@@ -447,13 +328,16 @@ void dispatch_scatter(
case int64:
dispatch_scatter_inds<InT, int64_t>(out, inds, updates, axes, rtype);
break;
default:
case float16:
case float32:
case bfloat16:
case complex64:
throw std::runtime_error(
"[Scatter::eval_cpu] Cannot scatter with indices type.");
"[Scatter::eval_cpu] Cannot scatter with floating point indices.");
}
}
void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
void Scatter::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() >= 2);
auto& src = inputs[0];
@@ -461,9 +345,7 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
auto& updates = inputs.back();
// Copy src into out (copy allocates memory for out)
auto ctype =
src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
copy(src, out, ctype);
copy(src, out, CopyType::General);
switch (src.dtype()) {
case bool_:
@@ -508,167 +390,4 @@ void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
template <typename T, typename IdxT, typename OpT>
void scatter_axis(
array& out,
const array idx,
const array& upd,
int axis,
const OpT& op) {
auto strides = idx.strides();
strides.erase(strides.begin() + axis);
auto shape = idx.shape();
shape.erase(shape.begin() + axis);
ContiguousIterator idx_it(shape, strides, upd.ndim() - 1);
strides = upd.strides();
strides.erase(strides.begin() + axis);
ContiguousIterator upd_it(shape, strides, upd.ndim() - 1);
auto idx_ptr = idx.data<IdxT>();
auto upd_ptr = upd.data<T>();
auto dst_ptr = out.data<T>();
auto idx_ax_stride = idx.strides(axis);
auto upd_ax_stride = upd.strides(axis);
auto dst_ax_stride = out.strides(axis);
auto idx_ax_size = idx.shape(axis);
auto dst_ax_size = out.shape(axis);
size_t size_pre = 1;
size_t size_post = 1;
for (int i = 0; i < axis; ++i) {
size_pre *= idx.shape(i);
}
for (int i = axis + 1; i < idx.ndim(); ++i) {
size_post *= idx.shape(i);
}
size_t stride_pre = size_post * dst_ax_size;
for (size_t i = 0; i < size_pre; i++) {
for (size_t k = 0; k < size_post; k++) {
for (int j = 0; j < idx_ax_size; ++j) {
auto ind_val = offset_neg_idx(
idx_ptr[idx_it.loc + j * idx_ax_stride], dst_ax_size);
op(upd_ptr[upd_it.loc + j * upd_ax_stride],
dst_ptr + k + ind_val * dst_ax_stride);
}
idx_it.step();
upd_it.step();
}
dst_ptr += stride_pre;
}
}
template <typename InT, typename IdxT>
void dispatch_scatter_axis_op(
array& out,
const array& idx,
const array& updates,
int axis,
ScatterAxis::ReduceType rtype) {
switch (rtype) {
case ScatterAxis::None:
scatter_axis<InT, IdxT>(
out, idx, updates, axis, [](auto x, auto* y) { (*y) = x; });
break;
case ScatterAxis::Sum:
scatter_axis<InT, IdxT>(
out, idx, updates, axis, [](auto x, auto* y) { (*y) += x; });
break;
}
}
template <typename InT>
void dispatch_scatter_axis(
array& out,
const array& idx,
const array& updates,
int axis,
ScatterAxis::ReduceType rtype) {
switch (idx.dtype()) {
case uint8:
dispatch_scatter_axis_op<InT, uint8_t>(out, idx, updates, axis, rtype);
break;
case uint16:
dispatch_scatter_axis_op<InT, uint16_t>(out, idx, updates, axis, rtype);
break;
case uint32:
dispatch_scatter_axis_op<InT, uint32_t>(out, idx, updates, axis, rtype);
break;
case uint64:
dispatch_scatter_axis_op<InT, uint64_t>(out, idx, updates, axis, rtype);
break;
case int8:
dispatch_scatter_axis_op<InT, int8_t>(out, idx, updates, axis, rtype);
break;
case int16:
dispatch_scatter_axis_op<InT, int16_t>(out, idx, updates, axis, rtype);
break;
case int32:
dispatch_scatter_axis_op<InT, int32_t>(out, idx, updates, axis, rtype);
break;
case int64:
dispatch_scatter_axis_op<InT, int64_t>(out, idx, updates, axis, rtype);
break;
default:
throw std::runtime_error(
"[ScatterAxis::eval_cpu] Cannot scatter with indices type.");
}
}
void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() >= 2);
auto& src = inputs[0];
auto& idx = inputs[1];
auto& updates = inputs[2];
// Copy src into out (copy allocates memory for out)
auto ctype =
src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
copy(src, out, ctype);
switch (src.dtype()) {
case bool_:
dispatch_scatter_axis<bool>(out, idx, updates, axis_, reduce_type_);
break;
case uint8:
dispatch_scatter_axis<uint8_t>(out, idx, updates, axis_, reduce_type_);
break;
case uint16:
dispatch_scatter_axis<uint16_t>(out, idx, updates, axis_, reduce_type_);
break;
case uint32:
dispatch_scatter_axis<uint32_t>(out, idx, updates, axis_, reduce_type_);
break;
case uint64:
dispatch_scatter_axis<uint64_t>(out, idx, updates, axis_, reduce_type_);
break;
case int8:
dispatch_scatter_axis<int8_t>(out, idx, updates, axis_, reduce_type_);
break;
case int16:
dispatch_scatter_axis<int16_t>(out, idx, updates, axis_, reduce_type_);
break;
case int32:
dispatch_scatter_axis<int32_t>(out, idx, updates, axis_, reduce_type_);
break;
case int64:
dispatch_scatter_axis<int64_t>(out, idx, updates, axis_, reduce_type_);
break;
case float16:
dispatch_scatter_axis<float16_t>(out, idx, updates, axis_, reduce_type_);
break;
case float32:
dispatch_scatter_axis<float>(out, idx, updates, axis_, reduce_type_);
break;
case bfloat16:
dispatch_scatter_axis<bfloat16_t>(out, idx, updates, axis_, reduce_type_);
break;
case complex64:
dispatch_scatter_axis<complex64_t>(
out, idx, updates, axis_, reduce_type_);
break;
}
}
} // namespace mlx::core

View File

@@ -1,8 +1,8 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack.h"
#include "mlx/primitives.h"
int strtri_wrapper(char uplo, char diag, float* matrix, int N) {
@@ -110,7 +110,7 @@ void inverse_impl(const array& a, array& inv, bool tri, bool upper) {
}
}
void Inverse::eval_cpu(const std::vector<array>& inputs, array& output) {
void Inverse::eval(const std::vector<array>& inputs, array& output) {
if (inputs[0].dtype() != float32) {
throw std::runtime_error("[Inverse::eval] only supports float32.");
}

View File

@@ -11,7 +11,7 @@
#define lapack_complex_double std::complex<double>
#endif
#ifdef MLX_USE_ACCELERATE
#ifdef ACCELERATE_NEW_LAPACK
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>

View File

@@ -1,9 +1,12 @@
// Copyright © 2023 Apple Inc.
#include <algorithm>
#include <cassert>
#include <utility>
#include "mlx/allocator.h"
#include "mlx/backend/common/load.h"
#include "mlx/primitives.h"
namespace {
@@ -48,4 +51,11 @@ void load(
}
}
void Load::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 0);
out.set_data(allocator::malloc_or_wait(out.nbytes()));
load(out, offset_, reader_, swap_endianness_);
}
} // namespace mlx::core

View File

@@ -10,21 +10,20 @@ OUTPUT_FILE=$1
GCC=$2
SRCDIR=$3
CLANG=$4
ARCH=$5
if [ "$CLANG" = "TRUE" ]; then
read -r -d '' INCLUDES <<- EOM
#include <cmath>
#include <complex>
#include <cstdint>
#include <vector>
#include <cmath>
#include <complex>
#include <cstdint>
#include <vector>
EOM
CC_FLAGS="-arch ${ARCH}"
CC_FLAGS=""
else
CC_FLAGS="-std=c++17"
fi
CONTENT=$($GCC $CC_FLAGS -I "$SRCDIR" -E "$SRCDIR/mlx/backend/cpu/compiled_preamble.h" 2>/dev/null)
CONTENT=$($GCC $CC_FLAGS -I "$SRCDIR" -E "$SRCDIR/mlx/backend/common/compiled_preamble.h" 2>/dev/null)
cat << EOF > "$OUTPUT_FILE"
const char* get_kernel_preamble() {

View File

@@ -3,9 +3,9 @@
#include <cstring>
#include "mlx/array.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/primitives.h"
namespace mlx::core {
@@ -53,7 +53,7 @@ inline void mask_matrix(
} // namespace
void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[BlockMaskedMM::eval] Currently only supports float32.");
@@ -210,7 +210,7 @@ void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
void GatherMM::eval(const std::vector<array>& inputs, array& out) {
if (out.dtype() != float32) {
throw std::runtime_error(
"[GatherMM::eval] Currently only supports float32.");

680
mlx/backend/common/ops.h Normal file
View File

@@ -0,0 +1,680 @@
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <stdint.h>
#include <cmath>
#include <complex>
namespace mlx::core::detail {
namespace {
constexpr float inf = std::numeric_limits<float>::infinity();
} // namespace
typedef union {
int i;
float f;
} IntOrFloat;
inline float fast_exp(float x) {
if (x == -std::numeric_limits<float>::infinity()) {
return 0.0f;
} else if (x == std::numeric_limits<float>::infinity() || std::isnan(x)) {
return x;
}
x *= 1.442695; // multiply by log_2(e)
float ipart, fpart;
IntOrFloat epart;
x = std::max(-80.f, std::min(x, 80.f));
ipart = std::floor(x + 0.5);
fpart = x - ipart;
x = 1.535336188319500e-4f;
x = x * fpart + 1.339887440266574e-3f;
x = x * fpart + 9.618437357674640e-3f;
x = x * fpart + 5.550332471162809e-2f;
x = x * fpart + 2.402264791363012e-1f;
x = x * fpart + 6.931472028550421e-1f;
x = x * fpart + 1.000000000000000f;
// generate 2**ipart in the floating point representation using integer
// bitshifting
epart.i = (int(ipart) + 127) << 23;
return epart.f * x;
}
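The final lines of fast_exp construct 2**ipart by writing the biased exponent directly into a float's bit pattern. A tiny self-contained illustration of that trick (using memcpy instead of the union, purely as a sketch):

#include <cstdio>
#include <cstring>

// For an integer n in a safe range, putting (n + 127) in the exponent field
// of an IEEE-754 single-precision float yields exactly 2^n.
float pow2_via_bits(int n) {
  unsigned bits = static_cast<unsigned>(n + 127) << 23;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  std::printf(
      "%g %g %g\n", pow2_via_bits(-3), pow2_via_bits(0), pow2_via_bits(10));
  // prints: 0.125 1 1024
}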
inline float fast_erf(float a) {
float r, s, t, u;
t = std::abs(a);
s = a * a;
if (t > 0.927734375f) {
// maximum error 0.99527 ulp
r = std::fma(
-1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12
u = std::fma(
-3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6
r = std::fma(r, s, u);
r = std::fma(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4
r = std::fma(r, t, -6.34846687e-1f); // -0x1.450aa0p-1
r = std::fma(r, t, -1.28717512e-1f); // -0x1.079d0cp-3
r = std::fma(r, t, -t);
// TODO, replace with expm1 when implemented
r = 1.0f - std::exp(r);
r = std::copysign(r, a);
} else {
// maximum error 0.98929 ulp
r = -5.96761703e-4f; // -0x1.38e000p-11
r = std::fma(r, s, 4.99119423e-3f); // 0x1.471a58p-8
r = std::fma(r, s, -2.67681349e-2f); // -0x1.b691b2p-6
r = std::fma(r, s, 1.12819925e-1f); // 0x1.ce1c44p-4
r = std::fma(r, s, -3.76125336e-1f); // -0x1.812700p-2
r = std::fma(r, s, 1.28379166e-1f); // 0x1.06eba8p-3
r = std::fma(r, a, a);
}
return r;
}
inline float fast_erfinv(float a) {
auto t = std::fma(a, 0.0f - a, 1.0f);
t = std::log(t);
float p;
if (std::abs(t) > 6.125f) { // maximum ulp error = 2.35793
p = 3.03697567e-10f; // 0x1.4deb44p-32
p = std::fma(p, t, 2.93243101e-8f); // 0x1.f7c9aep-26
p = std::fma(p, t, 1.22150334e-6f); // 0x1.47e512p-20
p = std::fma(p, t, 2.84108955e-5f); // 0x1.dca7dep-16
p = std::fma(p, t, 3.93552968e-4f); // 0x1.9cab92p-12
p = std::fma(p, t, 3.02698812e-3f); // 0x1.8cc0dep-9
p = std::fma(p, t, 4.83185798e-3f); // 0x1.3ca920p-8
p = std::fma(p, t, -2.64646143e-1f); // -0x1.0eff66p-2
p = std::fma(p, t, 8.40016484e-1f); // 0x1.ae16a4p-1
} else { // maximum ulp error = 2.35002
p = 5.43877832e-9f; // 0x1.75c000p-28
p = std::fma(p, t, 1.43285448e-7f); // 0x1.33b402p-23
p = std::fma(p, t, 1.22774793e-6f); // 0x1.499232p-20
p = std::fma(p, t, 1.12963626e-7f); // 0x1.e52cd2p-24
p = std::fma(p, t, -5.61530760e-5f); // -0x1.d70bd0p-15
p = std::fma(p, t, -1.47697632e-4f); // -0x1.35be90p-13
p = std::fma(p, t, 2.31468678e-3f); // 0x1.2f6400p-9
p = std::fma(p, t, 1.15392581e-2f); // 0x1.7a1e50p-7
p = std::fma(p, t, -2.32015476e-1f); // -0x1.db2aeep-3
p = std::fma(p, t, 8.86226892e-1f); // 0x1.c5bf88p-1
}
return a * p;
}
struct Abs {
template <typename T>
T operator()(T x) {
return std::abs(x);
}
uint8_t operator()(uint8_t x) {
return x;
}
uint16_t operator()(uint16_t x) {
return x;
}
uint32_t operator()(uint32_t x) {
return x;
}
uint64_t operator()(uint64_t x) {
return x;
}
bool operator()(bool x) {
return x;
}
};
struct ArcCos {
template <typename T>
T operator()(T x) {
return std::acos(x);
}
};
struct ArcCosh {
template <typename T>
T operator()(T x) {
return std::acosh(x);
}
};
struct ArcSin {
template <typename T>
T operator()(T x) {
return std::asin(x);
}
};
struct ArcSinh {
template <typename T>
T operator()(T x) {
return std::asinh(x);
}
};
struct ArcTan {
template <typename T>
T operator()(T x) {
return std::atan(x);
}
};
struct ArcTan2 {
template <typename T>
T operator()(T y, T x) {
return std::atan2(y, x);
}
};
struct ArcTanh {
template <typename T>
T operator()(T x) {
return std::atanh(x);
}
};
struct Ceil {
template <typename T>
T operator()(T x) {
return std::ceil(x);
}
int8_t operator()(int8_t x) {
return x;
}
int16_t operator()(int16_t x) {
return x;
}
int32_t operator()(int32_t x) {
return x;
}
int64_t operator()(int64_t x) {
return x;
}
uint8_t operator()(uint8_t x) {
return x;
}
uint16_t operator()(uint16_t x) {
return x;
}
uint32_t operator()(uint32_t x) {
return x;
}
uint64_t operator()(uint64_t x) {
return x;
}
bool operator()(bool x) {
return x;
}
};
struct Conjugate {
complex64_t operator()(complex64_t x) {
return std::conj(x);
}
};
struct Cos {
template <typename T>
T operator()(T x) {
return std::cos(x);
}
};
struct Cosh {
template <typename T>
T operator()(T x) {
return std::cosh(x);
}
};
struct Erf {
template <typename T>
T operator()(T x) {
return static_cast<T>(fast_erf(static_cast<float>(x)));
}
};
struct ErfInv {
template <typename T>
T operator()(T x) {
return static_cast<T>(fast_erfinv(static_cast<float>(x)));
}
};
struct Exp {
template <typename T>
T operator()(T x) {
return fast_exp(x);
}
complex64_t operator()(complex64_t x) {
return std::exp(x);
}
};
struct Expm1 {
template <typename T>
T operator()(T x) {
return expm1(x);
}
};
struct Floor {
template <typename T>
T operator()(T x) {
return std::floor(x);
}
int8_t operator()(int8_t x) {
return x;
}
int16_t operator()(int16_t x) {
return x;
}
int32_t operator()(int32_t x) {
return x;
}
int64_t operator()(int64_t x) {
return x;
}
uint8_t operator()(uint8_t x) {
return x;
}
uint16_t operator()(uint16_t x) {
return x;
}
uint32_t operator()(uint32_t x) {
return x;
}
uint64_t operator()(uint64_t x) {
return x;
}
bool operator()(bool x) {
return x;
}
};
struct Imag {
template <typename T>
T operator()(T x) {
return std::imag(x);
}
};
struct Log {
template <typename T>
T operator()(T x) {
return std::log(x);
}
};
struct Log2 {
template <typename T>
T operator()(T x) {
return std::log2(x);
}
};
struct Log10 {
template <typename T>
T operator()(T x) {
return std::log10(x);
}
};
struct Log1p {
template <typename T>
T operator()(T x) {
return log1p(x);
}
};
struct LogicalNot {
template <typename T>
T operator()(T x) {
return !x;
}
};
struct Negative {
template <typename T>
T operator()(T x) {
return -x;
}
};
struct Real {
template <typename T>
T operator()(T x) {
return std::real(x);
}
};
struct Round {
template <typename T>
T operator()(T x) {
return std::rint(x);
}
complex64_t operator()(complex64_t x) {
return {std::rint(x.real()), std::rint(x.imag())};
}
};
struct Sigmoid {
template <typename T>
T operator()(T x) {
auto one = static_cast<decltype(x)>(1.0);
return one / (one + fast_exp(-x));
}
};
struct Sign {
template <typename T>
T operator()(T x) {
return (x > T(0)) - (x < T(0));
}
uint8_t operator()(uint8_t x) {
return x != 0;
}
uint16_t operator()(uint16_t x) {
return x != 0;
}
uint32_t operator()(uint32_t x) {
return x != 0;
}
uint64_t operator()(uint64_t x) {
return x != 0;
}
complex64_t operator()(complex64_t x) {
return x == complex64_t(0) ? x : x / std::abs(x);
}
};
struct Sin {
template <typename T>
T operator()(T x) {
return std::sin(x);
}
};
struct Sinh {
template <typename T>
T operator()(T x) {
return std::sinh(x);
}
};
struct Square {
template <typename T>
T operator()(T x) {
return x * x;
}
};
struct Sqrt {
template <typename T>
T operator()(T x) {
return std::sqrt(x);
}
};
struct Rsqrt {
template <typename T>
T operator()(T x) {
return static_cast<decltype(x)>(1.0) / std::sqrt(x);
}
};
struct Tan {
template <typename T>
T operator()(T x) {
return std::tan(x);
}
};
struct Tanh {
template <typename T>
T operator()(T x) {
return std::tanh(x);
}
};
struct Add {
template <typename T>
T operator()(T x, T y) {
return x + y;
}
};
struct Divide {
template <typename T>
T operator()(T x, T y) {
return x / y;
}
};
struct Remainder {
template <typename T>
std::enable_if_t<std::is_integral_v<T> & !std::is_signed_v<T>, T> operator()(
T numerator,
T denominator) {
return numerator % denominator;
}
template <typename T>
std::enable_if_t<std::is_integral_v<T> & std::is_signed_v<T>, T> operator()(
T numerator,
T denominator) {
auto r = numerator % denominator;
if (r != 0 && (r < 0 != denominator < 0))
r += denominator;
return r;
}
template <typename T>
std::enable_if_t<!std::is_integral_v<T>, T> operator()(
T numerator,
T denominator) {
auto r = std::fmod(numerator, denominator);
if (r != 0 && (r < 0 != denominator < 0)) {
r += denominator;
}
return r;
}
complex64_t operator()(complex64_t numerator, complex64_t denominator) {
return numerator % denominator;
}
};
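The signed-integer and floating-point overloads above implement a floored (Python-style) modulo: when the raw remainder and the denominator have opposite signs, the denominator is added back so the result takes the sign of the denominator. A small self-contained check of that behavior:

#include <cassert>

// Same correction as the signed-integer overload above.
int floored_mod(int numerator, int denominator) {
  int r = numerator % denominator;
  if (r != 0 && ((r < 0) != (denominator < 0))) {
    r += denominator;
  }
  return r;
}

int main() {
  assert(floored_mod(7, 3) == 1);
  assert(floored_mod(-7, 3) == 2);  // C's truncating % gives -1
  assert(floored_mod(7, -3) == -2); // C's truncating % gives 1
  assert(floored_mod(-7, -3) == -1);
  return 0;
}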
struct Equal {
template <typename T>
bool operator()(T x, T y) {
return x == y;
}
};
struct NaNEqual {
template <typename T>
bool operator()(T x, T y) {
if constexpr (std::is_integral_v<T>) {
// isnan always returns false for integers, and MSVC refuses to compile.
return x == y;
} else {
return x == y || (std::isnan(x) && std::isnan(y));
}
}
};
struct Greater {
template <typename T>
bool operator()(T x, T y) {
return x > y;
}
};
struct GreaterEqual {
template <typename T>
bool operator()(T x, T y) {
return x >= y;
}
};
struct Less {
template <typename T>
bool operator()(T x, T y) {
return x < y;
}
};
struct LessEqual {
template <typename T>
bool operator()(T x, T y) {
return x <= y;
}
};
struct Maximum {
template <typename T>
std::enable_if_t<std::is_integral_v<T>, T> operator()(T x, T y) {
return (x > y) ? x : y;
}
template <typename T>
std::enable_if_t<!std::is_integral_v<T>, T> operator()(T x, T y) {
if (std::isnan(x)) {
return x;
}
return (x > y) ? x : y;
}
};
struct Minimum {
template <typename T>
std::enable_if_t<std::is_integral_v<T>, T> operator()(T x, T y) {
return x < y ? x : y;
}
template <typename T>
std::enable_if_t<!std::is_integral_v<T>, T> operator()(T x, T y) {
if (std::isnan(x)) {
return x;
}
return x < y ? x : y;
}
};
struct LogAddExp {
template <typename T>
T operator()(T x, T y) {
constexpr float inf = std::numeric_limits<float>::infinity();
auto maxval = Maximum()(x, y);
auto minval = Minimum()(x, y);
return (minval == -inf || maxval == inf)
? maxval
: static_cast<decltype(x)>(
maxval + std::log1p(fast_exp(minval - maxval)));
}
};
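LogAddExp is the usual numerically stable form of log(e^x + e^y): take the max, then add log1p(exp(min - max)), which cannot overflow, with the infinite cases short-circuited. A quick standalone check of the identity (std::exp in place of fast_exp for this sketch):

#include <algorithm>
#include <cmath>
#include <cstdio>

float logaddexp(float x, float y) {
  float hi = std::max(x, y);
  float lo = std::min(x, y);
  if (std::isinf(hi) || std::isinf(lo)) {
    return hi; // mirrors the (minval == -inf || maxval == inf) check above
  }
  return hi + std::log1p(std::exp(lo - hi));
}

int main() {
  // exp(1000) overflows on its own, but the combined result is finite.
  std::printf("%f\n", logaddexp(1000.0f, 1000.0f)); // ~1000.693147
  std::printf("%f\n", logaddexp(0.0f, 0.0f));       // ~0.693147 (= ln 2)
}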
struct Multiply {
template <typename T>
T operator()(T x, T y) {
return x * y;
}
};
struct NotEqual {
template <typename T>
bool operator()(T x, T y) {
return x != y;
}
};
struct Power {
template <typename T>
std::enable_if_t<!std::is_integral_v<T>, T> operator()(T base, T exp) {
return std::pow(base, exp);
}
template <typename T>
std::enable_if_t<std::is_integral_v<T>, T> operator()(T base, T exp) {
T res = 1;
while (exp) {
if (exp & 1) {
res *= base;
}
exp >>= 1;
base *= base;
}
return res;
}
};
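For integral types, Power runs exponentiation by squaring: scan the exponent bit by bit, multiply the accumulator when the bit is set, and square the base each step, so only O(log exp) multiplications are needed. A worked example:

#include <cassert>
#include <cstdint>

// Same square-and-multiply loop as the integral overload above.
uint64_t ipow(uint64_t base, uint64_t exp) {
  uint64_t res = 1;
  while (exp) {
    if (exp & 1) {
      res *= base; // this bit of the exponent is set
    }
    exp >>= 1;
    base *= base; // base, base^2, base^4, ...
  }
  return res;
}

int main() {
  assert(ipow(3, 5) == 243); // 5 = 0b101, so res = 3^1 * 3^4
  assert(ipow(2, 10) == 1024);
  assert(ipow(7, 0) == 1);
  return 0;
}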
struct Subtract {
template <typename T>
T operator()(T x, T y) {
return x - y;
}
};
struct LogicalAnd {
template <typename T>
T operator()(T x, T y) {
return x && y;
}
};
struct LogicalOr {
template <typename T>
T operator()(T x, T y) {
return x || y;
}
};
struct Select {
template <typename T>
T operator()(bool condition, T x, T y) {
return condition ? x : y;
}
};
struct BitwiseAnd {
template <typename T>
T operator()(T x, T y) {
return x & y;
}
};
struct BitwiseOr {
template <typename T>
T operator()(T x, T y) {
return x | y;
}
};
struct BitwiseXor {
template <typename T>
T operator()(T x, T y) {
return x ^ y;
}
};
struct LeftShift {
template <typename T>
T operator()(T x, T y) {
return x << y;
}
};
struct RightShift {
template <typename T>
T operator()(T x, T y) {
return x >> y;
}
};
} // namespace mlx::core::detail

View File

@@ -0,0 +1,648 @@
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <numeric>
#include <sstream>
#include "mlx/allocator.h"
#include "mlx/backend/common/arange.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/ops.h"
#include "mlx/backend/common/slicing.h"
#include "mlx/backend/common/threefry.h"
#include "mlx/backend/common/unary.h"
#include "mlx/backend/common/utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
namespace mlx::core {
void Abs::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (issubdtype(in.dtype(), unsignedinteger)) {
// No-op for unsigned types
out.copy_shared_buffer(in);
} else {
unary(in, out, detail::Abs());
}
}
void Arange::eval(const std::vector<array>& inputs, array& out) {
arange(inputs, out, start_, step_);
}
void ArcCos::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::ArcCos());
} else {
throw std::invalid_argument(
"[arccos] Cannot compute inverse cosine of elements in array"
" with non floating point type.");
}
}
void ArcCosh::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::ArcCosh());
} else {
throw std::invalid_argument(
"[arccosh] Cannot compute inverse hyperbolic cosine of elements in"
" array with non floating point type.");
}
}
void ArcSin::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::ArcSin());
} else {
throw std::invalid_argument(
"[arcsin] Cannot compute inverse sine of elements in array"
" with non floating point type.");
}
}
void ArcSinh::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::ArcSinh());
} else {
throw std::invalid_argument(
"[arcsinh] Cannot compute inverse hyperbolic sine of elements in"
" array with non floating point type.");
}
}
void ArcTan::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::ArcTan());
} else {
throw std::invalid_argument(
"[arctan] Cannot compute inverse tangent of elements in array"
" with non floating point type.");
}
}
void ArcTanh::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::ArcTanh());
} else {
throw std::invalid_argument(
"[arctanh] Cannot compute inverse hyperbolic tangent of elements in"
" array with non floating point type.");
}
}
void AsType::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
copy(in, out, ctype);
}
void Ceil::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (issubdtype(in.dtype(), inexact)) {
unary_fp(in, out, detail::Ceil());
} else {
// No-op integer types
out.copy_shared_buffer(in);
}
}
void Concatenate::eval(const std::vector<array>& inputs, array& out) {
std::vector<int> sizes;
sizes.push_back(0);
for (auto& p : inputs) {
sizes.push_back(p.shape(axis_));
}
std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto strides = out.strides();
auto flags = out.flags();
flags.row_contiguous = false;
flags.col_contiguous = false;
flags.contiguous = false;
for (int i = 0; i < inputs.size(); i++) {
array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
size_t data_offset = strides[axis_] * sizes[i];
out_slice.copy_shared_buffer(
out, strides, flags, out_slice.size(), data_offset);
copy_inplace(inputs[i], out_slice, CopyType::GeneralGeneral);
}
}
void Conjugate::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (out.dtype() == complex64) {
unary_fp(in, out, detail::Conjugate());
} else {
throw std::invalid_argument(
"[conjugate] conjugate must be called on complex input.");
}
}
void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.flags().row_contiguous ||
(allow_col_major_ && in.flags().col_contiguous)) {
out.copy_shared_buffer(in);
} else {
copy(in, out, CopyType::General);
}
}
void Cos::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Cos());
} else {
throw std::invalid_argument(
"[cos] Cannot compute cosine of elements in array"
" with non floating point type.");
}
}
void Cosh::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Cosh());
} else {
throw std::invalid_argument(
"[cosh] Cannot compute hyperbolic cosine of elements in array"
" with non floating point type.");
}
}
void Erf::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
switch (out.dtype()) {
case float32:
unary_op<float>(in, out, detail::Erf());
break;
case float16:
unary_op<float16_t>(in, out, detail::Erf());
break;
case bfloat16:
unary_op<bfloat16_t>(in, out, detail::Erf());
break;
default:
throw std::invalid_argument(
"[erf] Error function only defined for arrays"
" with real floating point type.");
}
}
void ErfInv::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
switch (out.dtype()) {
case float32:
unary_op<float>(in, out, detail::ErfInv());
break;
case float16:
unary_op<float16_t>(in, out, detail::ErfInv());
break;
case bfloat16:
unary_op<bfloat16_t>(in, out, detail::ErfInv());
break;
default:
throw std::invalid_argument(
"[erf_inv] Inverse error function only defined for arrays"
" with real floating point type.");
}
}
void Exp::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Exp());
} else {
throw std::invalid_argument(
"[exp] Cannot exponentiate elements in array"
" with non floating point type.");
}
}
void Expm1::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Expm1());
} else {
throw std::invalid_argument(
"[expm1] Cannot exponentiate elements in array"
" with non floating point type.");
}
}
void Floor::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (issubdtype(in.dtype(), inexact)) {
unary_fp(in, out, detail::Floor());
} else {
// No-op integer types
out.copy_shared_buffer(in);
}
}
void Full::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
assert(in.dtype() == out.dtype());
CopyType ctype;
if (in.data_size() == 1) {
ctype = CopyType::Scalar;
} else if (in.flags().contiguous) {
ctype = CopyType::Vector;
} else {
ctype = CopyType::General;
}
copy(in, out, ctype);
}
void Imag::eval_cpu(const std::vector<array>& inputs, array& out) {
unary_op<complex64_t, float>(inputs[0], out, detail::Imag());
}
void Log::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
switch (base_) {
case Base::e:
unary_fp(in, out, detail::Log());
break;
case Base::two:
unary_fp(in, out, detail::Log2());
break;
case Base::ten:
unary_fp(in, out, detail::Log10());
break;
}
} else {
throw std::invalid_argument(
"[log] Cannot compute log of elements in array with"
" non floating point type.");
}
}
void Log1p::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Log1p());
} else {
throw std::invalid_argument(
"[log1p] Cannot compute log of elements in array with"
" non floating point type.");
}
}
void LogicalNot::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
unary(in, out, detail::LogicalNot());
}
void Negative::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
unary(in, out, detail::Negative());
}
void Pad::eval(const std::vector<array>& inputs, array& out) {
// Inputs must be base input array and scalar val array
assert(inputs.size() == 2);
auto& in = inputs[0];
auto& val = inputs[1];
// Padding value must be a scalar
assert(val.size() == 1);
// Padding value, input and output must be of the same type
assert(val.dtype() == in.dtype() && in.dtype() == out.dtype());
// Fill output with val
copy(val, out, CopyType::Scalar);
// Find offset for start of input values
size_t data_offset = 0;
for (int i = 0; i < axes_.size(); i++) {
auto ax = axes_[i] < 0 ? out.ndim() + axes_[i] : axes_[i];
data_offset += out.strides()[ax] * low_pad_size_[i];
}
// Extract slice from output where input will be pasted
array out_slice(in.shape(), out.dtype(), nullptr, {});
out_slice.copy_shared_buffer(
out, out.strides(), out.flags(), out_slice.size(), data_offset);
// Copy input values into the slice
copy_inplace(in, out_slice, CopyType::GeneralGeneral);
}
void RandomBits::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
// keys has shape (N1, ..., NK, 2)
// out has shape (N1, ..., NK, M1, M2, ...)
auto& keys = inputs[0];
size_t num_keys = keys.size() / 2;
size_t elems_per_key = out.size() / num_keys;
size_t bytes_per_key = out.itemsize() * elems_per_key;
out.set_data(allocator::malloc_or_wait(out.nbytes()));
auto kptr = inputs[0].data<uint32_t>();
auto cptr = out.data<char>();
size_t out_skip = (bytes_per_key + 4 - 1) / 4;
auto half_size = out_skip / 2;
bool even = out_skip % 2 == 0;
for (int i = 0; i < num_keys; ++i, cptr += bytes_per_key) {
auto ptr = reinterpret_cast<uint32_t*>(cptr);
// Get ith key
auto kidx = 2 * i;
auto k1_elem = elem_to_loc(kidx, keys.shape(), keys.strides());
auto k2_elem = elem_to_loc(kidx + 1, keys.shape(), keys.strides());
auto key = std::make_pair(kptr[k1_elem], kptr[k2_elem]);
std::pair<uintptr_t, uintptr_t> count{0, half_size + !even};
for (; count.first + 1 < half_size; count.first++, count.second++) {
std::tie(ptr[count.first], ptr[count.second]) =
random::threefry2x32_hash(key, count);
}
if (count.first < half_size) {
auto rb = random::threefry2x32_hash(key, count);
ptr[count.first++] = rb.first;
if (bytes_per_key % 4 > 0) {
std::copy(
reinterpret_cast<char*>(&rb.second),
reinterpret_cast<char*>(&rb.second) + bytes_per_key % 4,
cptr + 4 * count.second);
} else {
ptr[count.second] = rb.second;
}
}
if (!even) {
count.second = 0;
ptr[half_size] = random::threefry2x32_hash(key, count).first;
}
}
}
void Real::eval_cpu(const std::vector<array>& inputs, array& out) {
unary_op<complex64_t, float>(inputs[0], out, detail::Real());
}
void Reshape::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
auto [copy_necessary, out_strides] = prepare_reshape(in, out);
if (copy_necessary) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
copy_inplace(in, out, CopyType::General);
} else {
shared_buffer_reshape(in, out_strides, out);
}
}
void Round::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (issubdtype(in.dtype(), inexact)) {
unary_fp(in, out, detail::Round());
} else {
// No-op integer types
out.copy_shared_buffer(in);
}
}
void Sigmoid::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Sigmoid());
} else {
throw std::invalid_argument(
"[sigmoid] Cannot sigmoid of elements in array with"
" non floating point type.");
}
}
void Sign::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (in.dtype() == bool_) {
out.copy_shared_buffer(in);
} else {
unary(in, out, detail::Sign());
}
}
void Sin::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Sin());
} else {
throw std::invalid_argument(
"[sin] Cannot compute sine of elements in array"
" with non floating point type.");
}
}
void Sinh::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Sinh());
} else {
throw std::invalid_argument(
"[sinh] Cannot compute hyperbolic sine of elements in array"
" with non floating point type.");
}
}
void Slice::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
if (out.size() == 0) {
out.set_data(nullptr);
return;
}
auto& in = inputs[0];
// Calculate out strides, initial offset and if copy needs to be made
auto [data_offset, inp_strides] = prepare_slice(in, start_indices_, strides_);
auto copy_needed = std::any_of(
strides_.begin(), strides_.end(), [](auto i) { return i < 0; });
// Do copy if needed
if (copy_needed) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
Strides ostrides{out.strides().begin(), out.strides().end()};
copy_inplace(
/* const array& src = */ in,
/* array& dst = */ out,
/* const std::vector<int>& data_shape = */ out.shape(),
/* const std::vector<stride_t>& i_strides = */ inp_strides,
/* const std::vector<stride_t>& o_strides = */ ostrides,
/* int64_t i_offset = */ data_offset,
/* int64_t o_offset = */ 0,
/* CopyType ctype = */ CopyType::General);
} else {
size_t data_end = 1;
for (int i = 0; i < end_indices_.size(); ++i) {
if (in.shape()[i] > 1) {
auto end_idx = start_indices_[i] + out.shape()[i] * strides_[i] - 1;
data_end += end_idx * in.strides()[i];
}
}
size_t data_size = data_end - data_offset;
Strides ostrides{inp_strides.begin(), inp_strides.end()};
shared_buffer_slice(in, ostrides, data_offset, data_size, out);
}
}
void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 2);
if (out.size() == 0) {
out.set_data(nullptr);
return;
}
auto& in = inputs[0];
auto& upd = inputs[1];
if (upd.size() == 0) {
out.copy_shared_buffer(in);
return;
}
// Check if materialization is needed
auto ctype = in.flags().contiguous && in.size() == in.data_size()
? CopyType::Vector
: CopyType::General;
copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype);
// Calculate out strides, initial offset and if copy needs to be made
auto [data_offset, out_strides] = prepare_slice(in, start_indices_, strides_);
// Do copy
Strides upd_strides{upd.strides().begin(), upd.strides().end()};
copy_inplace(
/* const array& src = */ upd,
/* array& dst = */ out,
/* const std::vector<int>& data_shape = */ upd.shape(),
/* const std::vector<stride_t>& i_strides = */ upd_strides,
/* const std::vector<stride_t>& o_strides = */ out_strides,
/* int64_t i_offset = */ 0,
/* int64_t o_offset = */ data_offset,
/* CopyType ctype = */ CopyType::GeneralGeneral);
}
void Square::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
unary(in, out, detail::Square());
}
void Sqrt::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
if (recip_) {
unary_fp(in, out, detail::Rsqrt());
} else {
unary_fp(in, out, detail::Sqrt());
}
}
void Tan::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Tan());
} else {
throw std::invalid_argument(
"[tan] Cannot compute tangent of elements in array"
" with non floating point type.");
}
}
void Tanh::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
const auto& in = inputs[0];
if (issubdtype(out.dtype(), inexact)) {
unary_fp(in, out, detail::Tanh());
} else {
throw std::invalid_argument(
"[tanh] Cannot compute hyperbolic tangent of elements in array"
" with non floating point type.");
}
}
void View::eval_cpu(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
auto ibytes = size_of(in.dtype());
auto obytes = size_of(out.dtype());
// Conditions for buffer copying (disjunction):
// - type size is the same
// - type size is smaller and the last axis is contiguous
// - the entire array is row contiguous
if (ibytes == obytes || (obytes < ibytes && in.strides().back() == 1) ||
in.flags().row_contiguous) {
auto strides = in.strides();
for (int i = 0; i < static_cast<int>(strides.size()) - 1; ++i) {
strides[i] *= ibytes;
strides[i] /= obytes;
}
out.copy_shared_buffer(
in, strides, in.flags(), in.data_size() * ibytes / obytes);
} else {
auto tmp = array(
in.shape(), in.dtype() == bool_ ? uint8 : in.dtype(), nullptr, {});
tmp.set_data(allocator::malloc_or_wait(tmp.nbytes()));
if (in.dtype() == bool_) {
auto in_tmp = array(in.shape(), uint8, nullptr, {});
in_tmp.copy_shared_buffer(in);
copy_inplace(in_tmp, tmp, CopyType::General);
} else {
copy_inplace(in, tmp, CopyType::General);
}
auto flags = out.flags();
flags.contiguous = true;
flags.row_contiguous = true;
auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
out.move_shared_buffer(tmp, out.strides(), flags, out.size());
}
}
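In the shared-buffer branch above, every stride except the innermost is rescaled by ibytes/obytes, because one step in the old dtype spans ibytes/obytes elements of the new dtype. A concrete, illustrative instance of that arithmetic:

#include <cstdio>

// Viewing a row-contiguous int32 array of shape (4, 8) as int16:
// ibytes = 4, obytes = 2, so the outer stride 8 becomes 16 while the
// innermost stride stays 1 (and the shared data_size doubles).
int main() {
  int ibytes = 4, obytes = 2;
  long strides[2] = {8, 1};
  for (int i = 0; i < 1; ++i) { // all axes but the last
    strides[i] = strides[i] * ibytes / obytes;
  }
  std::printf("%ld %ld\n", strides[0], strides[1]); // 16 1
}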
} // namespace mlx::core

View File

@@ -1,8 +1,8 @@
// Copyright © 2023-2024 Apple Inc.
#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack.h"
#include "mlx/primitives.h"
namespace mlx::core {
@@ -41,7 +41,7 @@ template <typename T>
void qrf_impl(const array& a, array& q, array& r) {
const int M = a.shape(-2);
const int N = a.shape(-1);
const int lda = M;
const int lda = std::max(M, N);
size_t num_matrices = a.size() / (M * N);
int num_reflectors = std::min(M, N);
auto tau =
@@ -89,16 +89,13 @@ void qrf_impl(const array& a, array& q, array& r) {
allocator::free(work);
r.set_data(allocator::malloc_or_wait(r.nbytes()));
copy_inplace(in, r, CopyType::General);
for (int i = 0; i < num_matrices; ++i) {
/// num_reflectors x N
// Zero lower triangle
for (int j = 0; j < r.shape(-2); ++j) {
for (int k = 0; k < j; ++k) {
r.data<T>()[i * N * num_reflectors + j * N + k] = 0;
}
for (int k = j; k < r.shape(-1); ++k) {
r.data<T>()[i * N * num_reflectors + j * N + k] =
in.data<T>()[i * N * M + j + k * M];
r.data<T>()[i * N * M + j * N + k] = 0;
}
}
}
@@ -107,7 +104,7 @@ void qrf_impl(const array& a, array& q, array& r) {
lwork = -1;
lpack<T>::xorgqr(
&M,
&num_reflectors,
&N,
&num_reflectors,
nullptr,
&lda,
@@ -123,7 +120,7 @@ void qrf_impl(const array& a, array& q, array& r) {
// Compute Q
lpack<T>::xorgqr(
&M,
&num_reflectors,
&N,
&num_reflectors,
in.data<float>() + M * N * i,
&lda,
@@ -134,24 +131,14 @@ void qrf_impl(const array& a, array& q, array& r) {
}
q.set_data(allocator::malloc_or_wait(q.nbytes()));
for (int i = 0; i < num_matrices; ++i) {
// M x num_reflectors
for (int j = 0; j < q.shape(-2); ++j) {
for (int k = 0; k < q.shape(-1); ++k) {
q.data<T>()[i * M * num_reflectors + j * num_reflectors + k] =
in.data<T>()[i * N * M + j + k * M];
}
}
}
copy_inplace(in, q, CopyType::General);
// Cleanup
allocator::free(work);
allocator::free(tau);
}
void QRF::eval_cpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
void QRF::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
if (!(inputs[0].dtype() == float32)) {
throw std::runtime_error("[QRF::eval] only supports float32.");
}

View File

@@ -2,8 +2,8 @@
#include <cassert>
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/ops.h"
#include "mlx/fast_primitives.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
@@ -151,78 +151,6 @@ void _qmm_t(
}
}
template <int bits, int S>
simd::Simd<uint32_t, S> extract_bits_simd(const uint32_t* w) {
constexpr int bitmask = (1 << bits) - 1;
simd::Simd<uint32_t, S> wi;
if constexpr (bits == 4 && S == 8) {
constexpr std::array<uint32_t, 8> shifts_ = {{0, 4, 8, 12, 16, 20, 24, 28}};
auto shifts(*(simd::Simd<uint32_t, S>*)&shifts_);
wi = simd::Simd<uint32_t, S>(*w);
wi = wi >> shifts;
wi = wi & bitmask;
} else if constexpr (bits == 8 && S == 8) {
constexpr std::array<uint32_t, 8> shifts_ = {{0, 8, 16, 24, 0, 8, 16, 24}};
auto shifts(*(simd::Simd<uint32_t, S>*)&shifts_);
auto l = simd::Simd<uint32_t, 4>(*w++);
auto r = simd::Simd<uint32_t, 4>(*w);
wi = simd::Simd<uint32_t, S>(l, r);
wi = wi >> shifts;
wi = wi & bitmask;
} else {
// Appease compiler.. but should never get here
throw std::runtime_error("Unsupported combination for simd qmm.");
}
return wi;
}
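The helper above (removed in this diff) unpacks quantized weights by broadcasting a packed 32-bit word across SIMD lanes, shifting each lane by a different amount, and masking. A scalar sketch of the same extraction for bits = 4, where one uint32 packs eight 4-bit values:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t w = 0x87654321u;
  for (int i = 0; i < 8; ++i) {
    // lane i of the bits = 4 case corresponds to (w >> (4 * i)) & 0xF
    std::printf("%u ", (w >> (4 * i)) & 0xFu);
  }
  std::printf("\n"); // prints: 1 2 3 4 5 6 7 8
}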
template <typename T, int bits, int group_size>
void _qmm_t_simd(
T* result,
const T* x,
const uint32_t* w,
const T* scales,
const T* biases,
int M,
int N,
int K) {
constexpr int pack_factor = 32 / bits;
constexpr int packs_in_group = group_size / pack_factor;
constexpr int S = simd::max_size<T>;
static_assert(
S % pack_factor == 0, "SIMD size must be divisible by pack factor");
constexpr int packs_per_simd = S / pack_factor;
for (int m = 0; m < M; m++) {
const uint32_t* w_local = w;
const T* scales_local = scales;
const T* biases_local = biases;
for (int n = 0; n < N; n++) {
simd::Simd<float, S> acc(0);
auto x_local = x;
for (int k = 0; k < K; k += group_size) {
T scale = *scales_local++;
T bias = *biases_local++;
for (int kw = 0; kw < packs_in_group; kw += packs_per_simd) {
auto wf = simd::Simd<float, S>(extract_bits_simd<bits, S>(w_local));
w_local += packs_per_simd;
wf = wf * scale;
wf = wf + bias;
simd::Simd<float, S> x_simd = simd::load<T, S>(x_local);
acc = acc + x_simd * wf;
x_local += S;
}
}
*result = T(simd::sum(acc));
result++;
}
x += K;
}
}
template <typename T, int bits, int group_size>
void _qmm_dispatch_transpose(
T* result,
@@ -235,14 +163,9 @@ void _qmm_dispatch_transpose(
int K,
bool transposed_w) {
if (transposed_w) {
// the simd size must be a multiple of the number of elements per word
if constexpr (32 % bits == 0 && simd::max_size<T> % (32 / bits) == 0) {
_qmm_t_simd<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
} else {
_qmm_t<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
}
return _qmm_t<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
} else {
_qmm<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
return _qmm<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
}
}
@@ -326,13 +249,13 @@ void _qmm_dispatch(
int group_size,
bool transposed_w) {
int K = x.shape(-1);
int M = x.ndim() > 1 ? x.shape(-2) : 1;
int M = x.shape(-2);
int N = out.shape(-1);
int w_els = w.ndim() > 2 ? w.shape(-1) * w.shape(-2) : 0;
int g_els = w.ndim() > 2 ? scales.shape(-1) * scales.shape(-2) : 0;
int batch_size = x.size() / (K * M);
int batch_size = x.size() / x.shape(-1) / x.shape(-2);
for (int i = 0; i < batch_size; i++) {
switch (x.dtype()) {
case float32:
@@ -461,7 +384,7 @@ void _bs_qmm_dispatch(
} // namespace
void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
void QuantizedMatmul::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 4);
auto& x_pre = inputs[0];
@@ -488,7 +411,7 @@ void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
_qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
}
void GatherQMM::eval_cpu(const std::vector<array>& inputs, array& out) {
void GatherQMM::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 6);
auto& x_pre = inputs[0];

View File

@@ -1,147 +1,312 @@
// Copyright © 2024 Apple Inc.
// Copyright © 2023 Apple Inc.
#include <cassert>
#include <functional>
#include <limits>
#include "mlx/backend/common/reduce.h"
#include "mlx/primitives.h"
namespace mlx::core {
std::pair<Shape, Strides> shapes_without_reduction_axes(
const array& x,
const std::vector<int>& axes) {
auto shape = x.shape();
auto strides = x.strides();
namespace {
for (int i = axes.size() - 1; i >= 0; i--) {
int a = axes[i];
shape.erase(shape.begin() + a);
strides.erase(strides.begin() + a);
template <typename U>
struct Limits {
static const U max;
static const U min;
};
#define instantiate_default_limit(type) \
template <> \
struct Limits<type> { \
static constexpr type max = std::numeric_limits<type>::max(); \
static constexpr type min = std::numeric_limits<type>::min(); \
};
instantiate_default_limit(uint8_t);
instantiate_default_limit(uint16_t);
instantiate_default_limit(uint32_t);
instantiate_default_limit(uint64_t);
instantiate_default_limit(int8_t);
instantiate_default_limit(int16_t);
instantiate_default_limit(int32_t);
instantiate_default_limit(int64_t);
#define instantiate_float_limit(type) \
template <> \
struct Limits<type> { \
static const type max; \
static const type min; \
};
instantiate_float_limit(float16_t);
instantiate_float_limit(bfloat16_t);
instantiate_float_limit(float);
instantiate_float_limit(complex64_t);
template <>
struct Limits<bool> {
static constexpr bool max = true;
static constexpr bool min = false;
};
const float Limits<float>::max = std::numeric_limits<float>::infinity();
const float Limits<float>::min = -std::numeric_limits<float>::infinity();
const bfloat16_t Limits<bfloat16_t>::max =
std::numeric_limits<float>::infinity();
const bfloat16_t Limits<bfloat16_t>::min =
-std::numeric_limits<float>::infinity();
const float16_t Limits<float16_t>::max = std::numeric_limits<float>::infinity();
const float16_t Limits<float16_t>::min =
-std::numeric_limits<float>::infinity();
const complex64_t Limits<complex64_t>::max =
std::numeric_limits<float>::infinity();
const complex64_t Limits<complex64_t>::min =
-std::numeric_limits<float>::infinity();
struct AndReduce {
template <typename T>
void operator()(bool* a, T b) {
(*a) &= (b != 0);
}
return std::make_pair(shape, strides);
void operator()(bool* y, bool x) {
(*y) &= x;
}
};
struct OrReduce {
template <typename T>
void operator()(bool* a, T b) {
(*a) |= (b != 0);
}
void operator()(bool* y, bool x) {
(*y) |= x;
}
};
struct MaxReduce {
template <typename T>
std::enable_if_t<std::is_integral_v<T>> operator()(T* y, T x) {
(*y) = (*y > x) ? *y : x;
};
template <typename T>
std::enable_if_t<!std::is_integral_v<T>> operator()(T* y, T x) {
if (std::isnan(x)) {
*y = x;
} else {
(*y) = (*y > x) ? *y : x;
}
};
};
struct MinReduce {
template <typename T>
std::enable_if_t<std::is_integral_v<T>> operator()(T* y, T x) {
(*y) = (*y < x) ? *y : x;
};
template <typename T>
std::enable_if_t<!std::is_integral_v<T>> operator()(T* y, T x) {
if (std::isnan(x)) {
*y = x;
} else {
(*y) = (*y < x) ? *y : x;
}
};
};
template <typename InT>
void reduce_dispatch_and_or(
const array& in,
array& out,
Reduce::ReduceType rtype,
const std::vector<int>& axes) {
if (rtype == Reduce::And) {
reduction_op<InT, bool>(in, out, axes, true, AndReduce());
} else {
reduction_op<InT, bool>(in, out, axes, false, OrReduce());
}
}
ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
// The data is all there and we are reducing over everything
if (x.size() == x.data_size() && axes.size() == x.ndim() &&
x.flags().contiguous) {
return ContiguousAllReduce;
template <typename InT>
void reduce_dispatch_sum_prod(
const array& in,
array& out,
Reduce::ReduceType rtype,
const std::vector<int>& axes) {
if (rtype == Reduce::Sum) {
auto op = [](auto y, auto x) { (*y) = (*y) + x; };
if constexpr (std::is_integral_v<InT> && sizeof(InT) <= 4) {
reduction_op<InT, int32_t>(in, out, axes, 0, op);
} else {
reduction_op<InT, InT>(in, out, axes, 0, op);
}
} else {
auto op = [](auto y, auto x) { (*y) *= x; };
if constexpr (std::is_integral_v<InT> && sizeof(InT) <= 4) {
reduction_op<InT, int32_t>(in, out, axes, 1, op);
} else {
reduction_op<InT, InT>(in, out, axes, 1, op);
}
}
}
// Row contiguous input so the output is row contiguous
if (x.flags().row_contiguous) {
// Merge consecutive axes
Shape shape = {x.shape(axes[0])};
Strides strides = {x.strides()[axes[0]]};
for (int i = 1; i < axes.size(); i++) {
if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
shape.back() *= x.shape(axes[i]);
strides.back() = x.strides()[axes[i]];
} else {
shape.push_back(x.shape(axes[i]));
strides.push_back(x.strides()[axes[i]]);
template <typename InT>
void reduce_dispatch_min_max(
const array& in,
array& out,
Reduce::ReduceType rtype,
const std::vector<int>& axes) {
if (rtype == Reduce::Max) {
auto init = Limits<InT>::min;
reduction_op<InT, InT>(in, out, axes, init, MaxReduce());
} else {
auto init = Limits<InT>::max;
reduction_op<InT, InT>(in, out, axes, init, MinReduce());
}
}
} // namespace
void nd_loop(
std::function<void(int)> callback,
const Shape& shape,
const Strides& strides) {
std::function<void(int, int)> loop_inner;
loop_inner = [&](int dim, int offset) {
if (dim < shape.size() - 1) {
auto size = shape[dim];
auto stride = strides[dim];
for (int i = 0; i < size; i++) {
loop_inner(dim + 1, offset + i * stride);
}
} else {
auto size = shape[dim];
auto stride = strides[dim];
for (int i = 0; i < size; i++) {
callback(offset + i * stride);
}
}
};
loop_inner(0, 0);
}
// Remove singleton axes from the plan
for (int i = shape.size() - 1; i >= 0; i--) {
if (shape[i] == 1) {
shape.erase(shape.begin() + i);
strides.erase(strides.begin() + i);
void Reduce::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
switch (reduce_type_) {
case Reduce::And:
case Reduce::Or: {
switch (in.dtype()) {
case bool_:
case uint8:
case int8:
reduce_dispatch_and_or<int8_t>(in, out, reduce_type_, axes_);
break;
case int16:
case uint16:
case float16:
case bfloat16:
reduce_dispatch_and_or<int16_t>(in, out, reduce_type_, axes_);
break;
case uint32:
case int32:
case float32:
reduce_dispatch_and_or<int32_t>(in, out, reduce_type_, axes_);
break;
case uint64:
case int64:
case complex64:
reduce_dispatch_and_or<int64_t>(in, out, reduce_type_, axes_);
break;
}
break;
}
if (strides.back() == 1) {
return ReductionPlan(ContiguousReduce, shape, strides);
} else if (strides.back() > 1) {
return ReductionPlan(ContiguousStridedReduce, shape, strides);
}
}
// Let's check if we can optimize our access patterns
//
// 1. We have a reduction axis with stride 1. Simply call
// GeneralContiguousReduce and be done with it.
// 2. We have transpositions and we are not reducing over the axis with
// stride 1. However, we are reducing over an axis where everything is
// contiguous in memory to the right of that axis. We can call strided
// reduce and be done with it.
// 3. We have weird transpositions and expands. Copy the strides to the
// output, then call strided reduce.
// Sort reduction axes by stride in order to merge them and figure out if we
// have a contiguous reduction.
std::vector<std::pair<int, int64_t>> reductions;
for (auto a : axes) {
if (x.shape(a) > 1) {
reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
}
}
std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
bool a_is_zero = a.second == 0;
bool b_is_zero = b.second == 0;
return (a_is_zero != b_is_zero) ? a.second < b.second : a.second > b.second;
});
// Extract the two smallest and try to merge them in case the contiguous
// reduction can be bigger than just the last axis.
for (int i = reductions.size() - 1; i >= 1; i--) {
auto a = reductions[i];
auto b = reductions[i - 1];
// If b.stride == a.shape * a.stride then a and b are contiguous
if (b.second == a.first * a.second) {
reductions.erase(reductions.begin() + i);
reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
}
}
Shape shape;
Strides strides;
for (auto r : reductions) {
shape.push_back(r.first);
strides.push_back(r.second);
}
// We can call the contiguous reduction op for every weird way the input is
// structured in the rest of the axes.
if (strides.back() == 1) {
return ReductionPlan(GeneralContiguousReduce, shape, strides);
}
// Delegate to the general strided reduction op if the axes after
// strides.back() are contiguous.
if (strides.back() > 1) {
int64_t size = 1;
bool have_expand = false;
for (int i = x.ndim() - 1; i >= 0; i--) {
if (axes.back() == i) {
continue;
case Reduce::Sum:
case Reduce::Prod: {
switch (in.dtype()) {
case bool_:
case uint8:
case int8:
reduce_dispatch_sum_prod<int8_t>(in, out, reduce_type_, axes_);
break;
case int16:
case uint16:
reduce_dispatch_sum_prod<int16_t>(in, out, reduce_type_, axes_);
break;
case int32:
case uint32:
reduce_dispatch_sum_prod<int32_t>(in, out, reduce_type_, axes_);
break;
case int64:
case uint64:
reduce_dispatch_sum_prod<int64_t>(in, out, reduce_type_, axes_);
break;
case float16:
reduce_dispatch_sum_prod<float16_t>(in, out, reduce_type_, axes_);
break;
case bfloat16:
reduce_dispatch_sum_prod<bfloat16_t>(in, out, reduce_type_, axes_);
break;
case float32:
reduce_dispatch_sum_prod<float>(in, out, reduce_type_, axes_);
break;
case complex64:
reduce_dispatch_sum_prod<complex64_t>(in, out, reduce_type_, axes_);
break;
}
auto stride_i = x.strides()[i];
auto shape_i = x.shape(i);
if (stride_i == 0) {
if (shape_i == 1) {
continue;
}
have_expand = true;
break;
}
if (stride_i != size && shape_i != 1) {
break;
}
size *= shape_i;
break;
}
// In the case of an expanded dimension we are being conservative and
// require the smallest reduction stride to be smaller than the maximum row
// contiguous size. The reason is that we can't easily know if the reduced
// axis is before or after an expanded dimension.
if (size > strides.back() || (size == strides.back() && !have_expand)) {
return ReductionPlan(GeneralStridedReduce, shape, strides);
case Reduce::Max:
case Reduce::Min: {
switch (in.dtype()) {
case bool_:
reduce_dispatch_min_max<bool>(in, out, reduce_type_, axes_);
break;
case uint8:
reduce_dispatch_min_max<uint8_t>(in, out, reduce_type_, axes_);
break;
case uint16:
reduce_dispatch_min_max<uint16_t>(in, out, reduce_type_, axes_);
break;
case uint32:
reduce_dispatch_min_max<uint32_t>(in, out, reduce_type_, axes_);
break;
case uint64:
reduce_dispatch_min_max<uint64_t>(in, out, reduce_type_, axes_);
break;
case int8:
reduce_dispatch_min_max<uint8_t>(in, out, reduce_type_, axes_);
break;
case int16:
reduce_dispatch_min_max<uint16_t>(in, out, reduce_type_, axes_);
break;
case int32:
reduce_dispatch_min_max<int32_t>(in, out, reduce_type_, axes_);
break;
case int64:
reduce_dispatch_min_max<int64_t>(in, out, reduce_type_, axes_);
break;
case float16:
reduce_dispatch_min_max<float16_t>(in, out, reduce_type_, axes_);
break;
case float32:
reduce_dispatch_min_max<float>(in, out, reduce_type_, axes_);
break;
case bfloat16:
reduce_dispatch_min_max<bfloat16_t>(in, out, reduce_type_, axes_);
break;
case complex64:
reduce_dispatch_min_max<complex64_t>(in, out, reduce_type_, axes_);
break;
}
break;
}
}
return ReductionPlan(GeneralReduce, shape, strides);
}
} // namespace mlx::core

View File

@@ -48,8 +48,186 @@ struct ReductionPlan {
ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);
// Helper for the ndimensional strided loop
// Should this be in utils?
void nd_loop(
std::function<void(int)> callback,
const Shape& shape,
const Strides& strides);
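nd_loop recursively walks an n-dimensional index space and hands the callback the flat offset i0*stride0 + i1*stride1 + ... for every index tuple, which lets the reduction kernels visit arbitrarily strided (including stride-0, i.e. broadcast) data. For instance, nd_loop(cb, {2, 3}, {3, 1}) invokes cb with offsets 0, 1, 2, 3, 4, 5, while nd_loop(cb, {2, 3}, {0, 1}) invokes it with 0, 1, 2, 0, 1, 2.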
std::pair<Shape, Strides> shapes_without_reduction_axes(
const array& x,
const std::vector<int>& axes);
template <typename T, typename U, typename Op>
struct DefaultStridedReduce {
Op op;
DefaultStridedReduce(Op op_) : op(op_) {}
void operator()(const T* x, U* accumulator, int size, size_t stride) {
for (int i = 0; i < size; i++) {
U* moving_accumulator = accumulator;
for (int j = 0; j < stride; j++) {
op(moving_accumulator, *x);
moving_accumulator++;
x++;
}
}
}
};
template <typename T, typename U, typename Op>
struct DefaultContiguousReduce {
Op op;
DefaultContiguousReduce(Op op_) : op(op_) {}
void operator()(const T* x, U* accumulator, int size) {
while (size-- > 0) {
op(accumulator, *x);
x++;
}
}
};
template <typename T, typename U, typename OpS, typename OpC, typename Op>
void reduction_op(
const array& x,
array& out,
const std::vector<int>& axes,
U init,
OpS ops,
OpC opc,
Op op) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
ReductionPlan plan = get_reduction_plan(x, axes);
if (plan.type == ContiguousAllReduce) {
U* out_ptr = out.data<U>();
*out_ptr = init;
opc(x.data<T>(), out_ptr, x.size());
return;
}
if (plan.type == ContiguousReduce && plan.shape.size() == 1) {
int reduction_size = plan.shape[0];
const T* x_ptr = x.data<T>();
U* out_ptr = out.data<U>();
for (int i = 0; i < out.size(); i++, out_ptr++, x_ptr += reduction_size) {
*out_ptr = init;
opc(x_ptr, out_ptr, reduction_size);
}
return;
}
if (plan.type == GeneralContiguousReduce || plan.type == ContiguousReduce) {
int reduction_size = plan.shape.back();
plan.shape.pop_back();
plan.strides.pop_back();
const T* x_ptr = x.data<T>();
U* out_ptr = out.data<U>();
// Unrolling the following loop (and implementing it in order for
// ContiguousReduce) should yield an extra performance boost.
auto [shape, strides] = shapes_without_reduction_axes(x, axes);
if (plan.shape.size() == 0) {
for (int i = 0; i < out.size(); i++, out_ptr++) {
int offset = elem_to_loc(i, shape, strides);
*out_ptr = init;
opc(x_ptr + offset, out_ptr, reduction_size);
}
} else {
for (int i = 0; i < out.size(); i++, out_ptr++) {
int offset = elem_to_loc(i, shape, strides);
*out_ptr = init;
nd_loop(
[&](int extra_offset) {
opc(x_ptr + offset + extra_offset, out_ptr, reduction_size);
},
plan.shape,
plan.strides);
}
}
return;
}
if (plan.type == ContiguousStridedReduce && plan.shape.size() == 1) {
int reduction_size = plan.shape.back();
size_t reduction_stride = plan.strides.back();
plan.shape.pop_back();
plan.strides.pop_back();
const T* x_ptr = x.data<T>();
U* out_ptr = out.data<U>();
for (int i = 0; i < out.size(); i += reduction_stride) {
std::fill_n(out_ptr, reduction_stride, init);
ops(x_ptr, out_ptr, reduction_size, reduction_stride);
x_ptr += reduction_stride * reduction_size;
out_ptr += reduction_stride;
}
return;
}
if (plan.type == GeneralStridedReduce ||
plan.type == ContiguousStridedReduce) {
int reduction_size = plan.shape.back();
size_t reduction_stride = plan.strides.back();
plan.shape.pop_back();
plan.strides.pop_back();
const T* x_ptr = x.data<T>();
U* out_ptr = out.data<U>();
auto [shape, strides] = shapes_without_reduction_axes(x, axes);
if (plan.shape.size() == 0) {
for (int i = 0; i < out.size(); i += reduction_stride) {
int offset = elem_to_loc(i, shape, strides);
std::fill_n(out_ptr, reduction_stride, init);
ops(x_ptr + offset, out_ptr, reduction_size, reduction_stride);
out_ptr += reduction_stride;
}
} else {
for (int i = 0; i < out.size(); i += reduction_stride) {
int offset = elem_to_loc(i, shape, strides);
std::fill_n(out_ptr, reduction_stride, init);
nd_loop(
[&](int extra_offset) {
ops(x_ptr + offset + extra_offset,
out_ptr,
reduction_size,
reduction_stride);
},
plan.shape,
plan.strides);
out_ptr += reduction_stride;
}
}
return;
}
if (plan.type == GeneralReduce) {
const T* x_ptr = x.data<T>();
U* out_ptr = out.data<U>();
auto [shape, strides] = shapes_without_reduction_axes(x, axes);
for (int i = 0; i < out.size(); i++, out_ptr++) {
int offset = elem_to_loc(i, shape, strides);
U val = init;
nd_loop(
[&](int extra_offset) { op(&val, *(x_ptr + offset + extra_offset)); },
plan.shape,
plan.strides);
*out_ptr = val;
}
}
}
template <typename T, typename U, typename Op>
void reduction_op(
const array& x,
array& out,
const std::vector<int>& axes,
U init,
Op op) {
DefaultStridedReduce<T, U, Op> ops(op);
DefaultContiguousReduce<T, U, Op> opc(op);
reduction_op<T, U>(x, out, axes, init, ops, opc, op);
}
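The two-argument overload above is what the dispatch helpers in reduce.cpp call: they supply only the element-wise accumulation op, and the default strided/contiguous kernels are built from it. A self-contained sketch of the ContiguousReduce case (reducing the last axis of a row-contiguous buffer), with an illustrative helper name:

#include <cstdio>
#include <vector>

// Reduce the last axis of a row-contiguous (rows x cols) buffer with a
// generic accumulation op, the way DefaultContiguousReduce is driven above.
template <typename T, typename U, typename Op>
void reduce_last_axis(const T* x, U* out, int rows, int cols, U init, Op op) {
  for (int i = 0; i < rows; ++i, ++out, x += cols) {
    *out = init;
    for (int j = 0; j < cols; ++j) {
      op(out, x[j]);
    }
  }
}

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6}; // shape (2, 3)
  std::vector<float> sums(2);
  reduce_last_axis(x.data(), sums.data(), 2, 3, 0.0f,
                   [](float* y, float v) { *y += v; });
  std::printf("%g %g\n", sums[0], sums[1]); // 6 15
}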
} // namespace mlx::core

View File

@@ -0,0 +1,147 @@
// Copyright © 2024 Apple Inc.
#include "mlx/backend/common/reduce.h"
namespace mlx::core {
std::pair<Shape, Strides> shapes_without_reduction_axes(
const array& x,
const std::vector<int>& axes) {
auto shape = x.shape();
auto strides = x.strides();
for (int i = axes.size() - 1; i >= 0; i--) {
int a = axes[i];
shape.erase(shape.begin() + a);
strides.erase(strides.begin() + a);
}
return std::make_pair(shape, strides);
}
ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
// The data is all there and we are reducing over everything
if (x.size() == x.data_size() && axes.size() == x.ndim() &&
x.flags().contiguous) {
return ContiguousAllReduce;
}
// Row contiguous input so the output is row contiguous
if (x.flags().row_contiguous) {
// Merge consecutive axes
Shape shape = {x.shape(axes[0])};
Strides strides = {x.strides()[axes[0]]};
for (int i = 1; i < axes.size(); i++) {
if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
shape.back() *= x.shape(axes[i]);
strides.back() = x.strides()[axes[i]];
} else {
shape.push_back(x.shape(axes[i]));
strides.push_back(x.strides()[axes[i]]);
}
}
// Remove singleton axes from the plan
for (int i = shape.size() - 1; i >= 0; i--) {
if (shape[i] == 1) {
shape.erase(shape.begin() + i);
strides.erase(strides.begin() + i);
}
}
if (strides.back() == 1) {
return ReductionPlan(ContiguousReduce, shape, strides);
} else if (strides.back() > 1) {
return ReductionPlan(ContiguousStridedReduce, shape, strides);
}
}
// Let's check if we can optimize our access patterns
//
// 1. We have a reduction axis with stride 1. Simply call
// GeneralContiguousReduce and be done with it.
// 2. We have transpositions and we are not reducing over the axis with
// stride 1. However, we are reducing over an axis where everything is
// contiguous in memory to the right of that axis. We can call strided
// reduce and be done with it.
// 3. We have weird transpositions and expands. Copy the strides to the
// output, then call strided reduce.
// Sort reduction axes by stride in order to merge them and figure out if we
// have a contiguous reduction.
std::vector<std::pair<int, int64_t>> reductions;
for (auto a : axes) {
if (x.shape(a) > 1) {
reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
}
}
std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
bool a_is_zero = a.second == 0;
bool b_is_zero = b.second == 0;
return (a_is_zero != b_is_zero) ? a.second < b.second : a.second > b.second;
});
// Extract the two smallest and try to merge them in case the contiguous
// reduction can be bigger than just the last axis.
for (int i = reductions.size() - 1; i >= 1; i--) {
auto a = reductions[i];
auto b = reductions[i - 1];
// If b.stride == a.shape * a.stride, then a and b are contiguous
if (b.second == a.first * a.second) {
reductions.erase(reductions.begin() + i);
reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
}
}
Shape shape;
Strides strides;
for (auto r : reductions) {
shape.push_back(r.first);
strides.push_back(r.second);
}
// We can call the contiguous reduction op for every weird way the input is
// structured in the rest of the axes.
if (strides.back() == 1) {
return ReductionPlan(GeneralContiguousReduce, shape, strides);
}
// Delegate to the general strided reduction op if the axes after
// strides.back() are contiguous.
if (strides.back() > 1) {
int64_t size = 1;
bool have_expand = false;
for (int i = x.ndim() - 1; i >= 0; i--) {
if (axes.back() == i) {
continue;
}
auto stride_i = x.strides()[i];
auto shape_i = x.shape(i);
if (stride_i == 0) {
if (shape_i == 1) {
continue;
}
have_expand = true;
break;
}
if (stride_i != size && shape_i != 1) {
break;
}
size *= shape_i;
}
// In the case of an expanded dimension we are being conservative and
// require the smallest reduction stride to be smaller than the maximum row
// contiguous size. The reason is that we can't easily know if the reduced
// axis is before or after an expanded dimension.
if (size > strides.back() || (size == strides.back() && !have_expand)) {
return ReductionPlan(GeneralStridedReduce, shape, strides);
}
}
return ReductionPlan(GeneralReduce, shape, strides);
}
} // namespace mlx::core
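
The axis-merging step above is what lets several reduced axes collapse into a single contiguous run, which in turn decides whether the contiguous or strided plan is chosen. A standalone sketch of just that merge, using hypothetical (shape, stride) pairs rather than a real array:

// Minimal sketch of the merge loop above, outside of MLX: given each reduced
// axis as a (shape, stride) pair sorted by decreasing stride, merge neighbors
// where b.stride == a.shape * a.stride, i.e. where they form one contiguous run.
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // e.g. reducing two axes of a transposed array with (shape, stride) of
  // (4, 8) and (8, 1): since 8 == 8 * 1, they merge into a single (32, 1) axis.
  std::vector<std::pair<int, long>> reductions = {{4, 8}, {8, 1}};
  for (int i = static_cast<int>(reductions.size()) - 1; i >= 1; --i) {
    auto a = reductions[i];     // smaller stride
    auto b = reductions[i - 1]; // next larger stride
    if (b.second == static_cast<long>(a.first) * a.second) {
      reductions.erase(reductions.begin() + i);
      reductions[i - 1] = {a.first * b.first, a.second};
    }
  }
  for (auto& r : reductions)
    std::printf("(shape=%d, stride=%ld)\n", r.first, r.second); // (shape=32, stride=1)
  // A trailing stride of 1 means the contiguous-reduce path can be used.
}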

325
mlx/backend/common/scan.cpp Normal file
View File

@@ -0,0 +1,325 @@
// Copyright © 2023 Apple Inc.
#include <cassert>
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
#include "mlx/primitives.h"
namespace mlx::core {
namespace {
template <typename T, typename U, typename Op>
struct DefaultContiguousScan {
Op op;
U init;
DefaultContiguousScan(Op op_, U init_) : op(op_), init(init_) {}
void operator()(
const T* input,
U* output,
int count,
int stride,
bool reverse,
bool inclusive) {
if (!reverse) {
if (inclusive) {
for (int i = 0; i < count; i++) {
*output = *input;
for (int j = 1; j < stride; j++) {
input++;
output++;
op(output, output - 1, input);
}
output++;
input++;
}
} else {
for (int i = 0; i < count; i++) {
*output = init;
for (int j = 1; j < stride; j++) {
op(output + 1, output, input);
input++;
output++;
}
output++;
input++;
}
}
} else {
if (inclusive) {
for (int i = 0; i < count; i++) {
output += stride - 1;
input += stride - 1;
*output = *input;
for (int j = 1; j < stride; j++) {
input--;
output--;
op(output, output + 1, input);
}
output += stride;
input += stride;
}
} else {
for (int i = 0; i < count; i++) {
output += stride - 1;
input += stride - 1;
*output = init;
for (int j = 1; j < stride; j++) {
op(output - 1, output, input);
input--;
output--;
}
output += stride;
input += stride;
}
}
}
}
};
template <typename T, typename U, typename Op>
struct DefaultStridedScan {
Op op;
U init;
DefaultStridedScan(Op op_, U init_) : op(op_), init(init_) {}
void operator()(
const T* input,
U* output,
int count,
int size,
int stride,
bool reverse,
bool inclusive) {
// TODO: Vectorize the following naive implementation
if (!reverse) {
if (inclusive) {
for (int i = 0; i < count; i++) {
std::copy(input, input + stride, output);
output += stride;
input += stride;
for (int j = 1; j < size; j++) {
for (int k = 0; k < stride; k++) {
op(output, output - stride, input);
output++;
input++;
}
}
}
} else {
for (int i = 0; i < count; i++) {
std::fill(output, output + stride, init);
output += stride;
input += stride;
for (int j = 1; j < size; j++) {
for (int k = 0; k < stride; k++) {
op(output, output - stride, input - stride);
output++;
input++;
}
}
}
}
} else {
if (inclusive) {
for (int i = 0; i < count; i++) {
output += (size - 1) * stride;
input += (size - 1) * stride;
std::copy(input, input + stride, output);
for (int j = 1; j < size; j++) {
for (int k = 0; k < stride; k++) {
output--;
input--;
op(output, output + stride, input);
}
}
output += size * stride;
input += size * stride;
}
} else {
for (int i = 0; i < count; i++) {
output += (size - 1) * stride;
input += (size - 1) * stride;
std::fill(output, output + stride, init);
for (int j = 1; j < size; j++) {
for (int k = 0; k < stride; k++) {
output--;
input--;
op(output, output + stride, input + stride);
}
}
output += size * stride;
input += size * stride;
}
}
}
}
};
template <typename T, typename U, typename OpCS, typename OpSS>
void scan_op(
OpCS opcs,
OpSS opss,
const array& input,
array& output,
int axis,
bool reverse,
bool inclusive) {
output.set_data(allocator::malloc_or_wait(output.nbytes()));
if (input.flags().row_contiguous) {
if (input.strides()[axis] == 1) {
opcs(
input.data<T>(),
output.data<U>(),
input.size() / input.shape(axis),
input.shape(axis),
reverse,
inclusive);
} else {
opss(
input.data<T>(),
output.data<U>(),
input.size() / input.shape(axis) / input.strides()[axis],
input.shape(axis),
input.strides()[axis],
reverse,
inclusive);
}
} else {
throw std::runtime_error("Scan op supports only contiguous inputs");
}
}
template <typename T, typename U>
void scan_dispatch(
Scan::ReduceType rtype,
const array& input,
array& output,
int axis,
bool reverse,
bool inclusive) {
switch (rtype) {
case Scan::Sum: {
auto op = [](U* o, const U* y, const T* x) { *o = *y + *x; };
auto init = static_cast<U>(0);
auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);
break;
}
case Scan::Prod: {
auto op = [](U* o, const U* y, const T* x) { *o = *y * (*x); };
auto init = static_cast<U>(1);
auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);
break;
}
case Scan::Min: {
auto op = [](U* o, const U* y, const T* x) { *o = (*x < *y) ? *x : *y; };
auto init = (issubdtype(input.dtype(), floating))
? static_cast<U>(std::numeric_limits<float>::infinity())
: std::numeric_limits<U>::max();
auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);
break;
}
case Scan::Max: {
auto op = [](U* o, const U* y, const T* x) { *o = (*x < *y) ? *y : *x; };
auto init = (issubdtype(input.dtype(), floating))
? static_cast<U>(-std::numeric_limits<float>::infinity())
: std::numeric_limits<U>::min();
auto opcs = DefaultContiguousScan<T, U, decltype(op)>(op, init);
auto opss = DefaultStridedScan<T, U, decltype(op)>(op, init);
scan_op<T, U>(opcs, opss, input, output, axis, reverse, inclusive);
break;
}
}
}
} // namespace
void Scan::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
// Ensure contiguity
auto in = inputs[0];
if (!in.flags().row_contiguous) {
array arr_copy(in.shape(), in.dtype(), nullptr, {});
copy(in, arr_copy, CopyType::General);
in = arr_copy;
}
switch (in.dtype()) {
case bool_: {
// We could do a full dtype x dtype switch but this is the only case
// where we accumulate in a different type, for now.
//
// TODO: If we add the option to accumulate floats in higher precision
// floats perhaps we should add the full all-to-all dispatch.
if (reduce_type_ == Scan::Sum && out.dtype() == int32) {
scan_dispatch<bool, int32_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
} else {
scan_dispatch<bool, bool>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
}
break;
}
case uint8:
scan_dispatch<uint8_t, uint8_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case uint16:
scan_dispatch<uint16_t, uint16_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case uint32:
scan_dispatch<uint32_t, uint32_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case uint64:
scan_dispatch<uint64_t, uint64_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case int8:
scan_dispatch<int8_t, int8_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case int16:
scan_dispatch<int16_t, int16_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case int32:
scan_dispatch<int32_t, int32_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case int64:
scan_dispatch<int64_t, int64_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case float16:
scan_dispatch<float16_t, float16_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case float32:
scan_dispatch<float, float>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case bfloat16:
scan_dispatch<bfloat16_t, bfloat16_t>(
reduce_type_, in, out, axis_, reverse_, inclusive_);
break;
case complex64:
throw std::runtime_error("Scan ops do not support complex types yet");
break;
}
}
} // namespace mlx::core
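
For reference, the inclusive/exclusive and forward/reverse variants handled above all reduce to the same prefix recurrence; only where the accumulator is written differs. A standalone sum-scan sketch (not MLX code) of the forward case:

// Standalone sketch of the scan semantics implemented above: an inclusive scan
// includes the current element, an exclusive scan starts from init and shifts
// everything right by one position; reverse scans run the same loop from the end.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> x = {1, 2, 3, 4};
  std::vector<int> inclusive(x.size()), exclusive(x.size());
  int acc = 0; // init for a sum scan
  for (int i = 0; i < 4; ++i) {
    exclusive[i] = acc; // prefix strictly before x[i]: 0 1 3 6
    acc += x[i];
    inclusive[i] = acc; // prefix including x[i]:       1 3 6 10
  }
  for (int v : inclusive) std::printf("%d ", v);
  std::printf("| ");
  for (int v : exclusive) std::printf("%d ", v);
  std::printf("\n"); // 1 3 6 10 | 0 1 3 6
}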

View File

@@ -2,8 +2,7 @@
#include <cassert>
#include "mlx/backend/cpu/binary_ops.h"
#include "mlx/backend/cpu/ternary.h"
#include "mlx/backend/common/ternary.h"
#include "mlx/primitives.h"
namespace mlx::core {
@@ -62,7 +61,7 @@ void select_op(
} // namespace
void Select::eval_cpu(const std::vector<array>& inputs, array& out) {
void Select::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 3);
const auto& condition = inputs[0];
const auto& a = inputs[1];

View File

@@ -35,29 +35,4 @@ void shared_buffer_slice(
move_or_copy(in, out, out_strides, flags, data_size, data_offset);
}
void slice(
const array& in,
array& out,
const Shape& start_indices,
const Shape& strides) {
if (out.size() == 0) {
out.set_data(nullptr);
return;
}
// Calculate out strides, initial offset
auto [data_offset, inp_strides] = prepare_slice(in, start_indices, strides);
int64_t data_end = 1;
for (int i = 0; i < start_indices.size(); ++i) {
if (in.shape()[i] > 1) {
auto end_idx = start_indices[i] + out.shape()[i] * strides[i] - 1;
data_end += end_idx * in.strides()[i];
}
}
// data_end can be -1
size_t data_size =
data_end < 0 ? (data_offset - data_end) : (data_end - data_offset);
shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
}
} // namespace mlx::core
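
The "data_end can be -1" comment above refers to negative-stride slices, where the last touched element sits before the slice's starting offset. A worked, standalone example with hypothetical slice parameters:

// Worked example of the bookkeeping above (standalone, illustrative values):
// slicing a length-10 vector with start = 9, stride = -2 and output length 5.
#include <cstdio>

int main() {
  long in_stride = 1; // row-contiguous 1-D input
  long start = 9, stride = -2, out_len = 5;

  long data_offset = start * in_stride;                 // 9
  long end_idx = start + out_len * stride - 1;          // 9 - 10 - 1 = -2
  long data_end = 1 + end_idx * in_stride;              // -1 (can be negative)
  long data_size = data_end < 0 ? data_offset - data_end // 10 elements spanned
                                : data_end - data_offset;
  std::printf("offset=%ld end=%ld size=%ld\n", data_offset, data_end, data_size);
  // Prints: offset=9 end=-1 size=10
}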

View File

@@ -11,10 +11,11 @@ std::tuple<int64_t, Strides> prepare_slice(
const Shape& start_indices,
const Shape& strides);
void slice(
void shared_buffer_slice(
const array& in,
array& out,
const Shape& start_indices,
const Shape& strides);
const Strides& out_strides,
size_t data_offset,
size_t data_size,
array& out);
} // namespace mlx::core

View File

@@ -3,108 +3,62 @@
#include <cassert>
#include <cmath>
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/backend/common/copy.h"
#include "mlx/primitives.h"
namespace mlx::core {
namespace {
using namespace mlx::core::simd;
template <typename T, typename AccT>
void softmax(const array& in, array& out) {
constexpr bool same_t = std::is_same_v<T, AccT>;
constexpr int N = std::min(max_size<AccT>, max_size<T>);
const T* in_ptr = in.data<T>();
T* out_ptr = out.data<T>();
int M = in.shape().back();
int L = in.data_size() / M;
int N = in.shape().back();
int M = in.data_size() / N;
const T* current_in_ptr;
T* current_out_ptr;
for (int i = 0; i < L; i++, in_ptr += M, out_ptr += M) {
for (int i = 0; i < M; i++, in_ptr += N, out_ptr += N) {
// Find the maximum
current_in_ptr = in_ptr;
Simd<AccT, N> vmaximum(-std::numeric_limits<float>::infinity());
size_t s = M;
while (s >= N) {
Simd<AccT, N> vals = load<T, N>(current_in_ptr);
vmaximum = maximum(vals, vmaximum);
current_in_ptr += N;
s -= N;
}
AccT maximum = max(vmaximum);
while (s-- > 0) {
maximum = std::max(maximum, static_cast<AccT>(*current_in_ptr));
current_in_ptr++;
AccT maximum = *current_in_ptr;
for (int j = 0; j < N; j++, current_in_ptr++) {
maximum = (maximum < *current_in_ptr) ? static_cast<AccT>(*current_in_ptr)
: maximum;
}
// Compute the normalizer and the exponentials
Simd<AccT, N> vnormalizer(0.0);
AccT normalizer = 0;
current_out_ptr = out_ptr;
current_in_ptr = in_ptr;
s = M;
while (s >= N) {
Simd<AccT, N> vexp = load<T, N>(current_in_ptr);
vexp = exp(vexp - maximum);
if constexpr (same_t) {
store(current_out_ptr, vexp);
for (int j = 0; j < N; j++, current_out_ptr++, current_in_ptr++) {
AccT expv = std::exp(*current_in_ptr - maximum);
normalizer += expv;
if constexpr (std::is_same<T, AccT>::value) {
*current_out_ptr = expv;
}
vnormalizer = vnormalizer + vexp;
current_in_ptr += N;
current_out_ptr += N;
s -= N;
}
AccT normalizer = sum(vnormalizer);
while (s-- > 0) {
AccT _exp = std::exp(*current_in_ptr - maximum);
if constexpr (same_t) {
*current_out_ptr = _exp;
}
normalizer += _exp;
current_in_ptr++;
current_out_ptr++;
}
normalizer = 1 / normalizer;
// Normalize
current_out_ptr = out_ptr;
current_in_ptr = in_ptr;
s = M;
while (s >= N) {
if constexpr (same_t) {
store(
current_out_ptr,
Simd<T, N>(load<T, N>(current_out_ptr) * normalizer));
} else {
Simd<AccT, N> vexp = load<T, N>(current_in_ptr);
vexp = exp(vexp - maximum) * normalizer;
store(current_out_ptr, Simd<T, N>(vexp));
current_in_ptr += N;
}
current_out_ptr += N;
s -= N;
}
while (s-- > 0) {
if constexpr (same_t) {
current_out_ptr = out_ptr;
for (int j = 0; j < N; j++, current_out_ptr++) {
if constexpr (std::is_same<T, AccT>::value) {
*current_out_ptr *= normalizer;
} else {
AccT _exp = std::exp(*current_in_ptr - maximum);
*current_out_ptr = static_cast<T>(_exp * normalizer);
auto v = std::exp(*current_in_ptr - maximum);
*current_out_ptr = static_cast<T>(v * normalizer);
current_in_ptr++;
}
current_out_ptr++;
}
}
}
} // namespace
void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
void Softmax::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
// Make sure that the last dimension is contiguous
@@ -143,7 +97,7 @@ void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
case int16:
case int32:
case int64:
throw std::runtime_error(
throw std::invalid_argument(
"Softmax is defined only for floating point types");
break;
case float32:
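For context, the scalar fallback above is the standard three-pass, numerically stable softmax: find the row maximum, exponentiate the shifted values while summing them, then normalize. A standalone sketch of that pattern (not MLX code):

// Standalone sketch of the three-pass softmax used in the scalar path above.
// Subtracting the row max before exponentiating keeps exp() from overflowing.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void softmax_row(const float* in, float* out, int n) {
  float maximum = in[0];
  for (int j = 1; j < n; ++j) maximum = std::max(maximum, in[j]); // pass 1: max
  float normalizer = 0.0f;
  for (int j = 0; j < n; ++j) {                                   // pass 2: exp + sum
    out[j] = std::exp(in[j] - maximum);
    normalizer += out[j];
  }
  float inv = 1.0f / normalizer;
  for (int j = 0; j < n; ++j) out[j] *= inv;                      // pass 3: normalize
}

int main() {
  std::vector<float> x = {1000.0f, 1001.0f, 1002.0f}, y(3);
  softmax_row(x.data(), y.data(), 3);
  std::printf("%.4f %.4f %.4f\n", y[0], y[1], y[2]); // ~0.0900 0.2447 0.6652
}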

View File

@@ -5,8 +5,8 @@
#include <cmath>
#include <numeric>
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/primitives.h"
@@ -14,10 +14,10 @@ namespace mlx::core {
namespace {
template <typename T>
template <typename T, typename IdxT = int32_t>
struct StridedIterator {
using iterator_category = std::random_access_iterator_tag;
using difference_type = int32_t;
using difference_type = IdxT;
using value_type = T;
using reference = value_type&;
using pointer = value_type*;
@@ -287,7 +287,7 @@ void argpartition(const array& in, array& out, int axis, int kth) {
} // namespace
void ArgSort::eval_cpu(const std::vector<array>& inputs, array& out) {
void ArgSort::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
@@ -321,7 +321,7 @@ void ArgSort::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {
void Sort::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
@@ -355,7 +355,7 @@ void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void ArgPartition::eval_cpu(const std::vector<array>& inputs, array& out) {
void ArgPartition::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];
@@ -389,7 +389,7 @@ void ArgPartition::eval_cpu(const std::vector<array>& inputs, array& out) {
}
}
void Partition::eval_cpu(const std::vector<array>& inputs, array& out) {
void Partition::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 1);
auto& in = inputs[0];

View File

@@ -1,8 +1,8 @@
// Copyright © 2024 Apple Inc.
#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/lapack.h"
#include "mlx/primitives.h"
namespace mlx::core {
@@ -137,9 +137,7 @@ void svd_impl(const array& a, array& u, array& s, array& vt) {
}
}
void SVD::eval_cpu(
const std::vector<array>& inputs,
std::vector<array>& outputs) {
void SVD::eval(const std::vector<array>& inputs, std::vector<array>& outputs) {
if (!(inputs[0].dtype() == float32)) {
throw std::runtime_error("[SVD::eval] only supports float32.");
}

View File

@@ -3,10 +3,12 @@
#pragma once
#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/ops.h"
#include "mlx/backend/common/utils.h"
namespace mlx::core {
namespace {
// TODO: Add support for more combinations of input types.
enum class TernaryOpType {
ScalarScalarScalar,
@@ -14,7 +16,7 @@ enum class TernaryOpType {
General,
};
inline TernaryOpType
TernaryOpType
get_ternary_op_type(const array& a, const array& b, const array& c) {
TernaryOpType topt;
if (a.data_size() == 1 && b.data_size() == 1 && c.data_size() == 1) {
@@ -31,7 +33,7 @@ get_ternary_op_type(const array& a, const array& b, const array& c) {
return topt;
}
inline void set_ternary_op_output_data(
void set_ternary_op_output_data(
const array& a,
const array& b,
const array& c,
@@ -65,14 +67,156 @@ inline void set_ternary_op_output_data(
}
break;
case TernaryOpType::General:
// Try to donate an input which is row_contiguous
if (!((a.flags().row_contiguous && maybe_donate(a)) ||
(b.flags().row_contiguous && maybe_donate(b)) ||
(c.flags().row_contiguous && maybe_donate(c)))) {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
}
out.set_data(allocator::malloc_or_wait(out.nbytes()));
break;
}
}
template <typename T1, typename T2, typename T3, typename U, typename Op, int D>
void ternary_op_dims(
const T1* a,
const T2* b,
const T3* c,
U* out,
Op op,
const Shape& shape,
const Strides& a_strides,
const Strides& b_strides,
const Strides& c_strides,
const Strides& out_strides,
int axis) {
auto stride_a = a_strides[axis];
auto stride_b = b_strides[axis];
auto stride_c = c_strides[axis];
auto stride_out = out_strides[axis];
auto N = shape[axis];
for (int i = 0; i < N; i++) {
if constexpr (D > 1) {
ternary_op_dims<T1, T2, T3, U, Op, D - 1>(
a,
b,
c,
out,
op,
shape,
a_strides,
b_strides,
c_strides,
out_strides,
axis + 1);
} else {
*out = op(*a, *b, *c);
}
a += stride_a;
b += stride_b;
c += stride_c;
out += stride_out;
}
}
template <typename T1, typename T2, typename T3, typename U, typename Op>
void ternary_op_dispatch_dims(
const array& a,
const array& b,
const array& c,
array& out,
Op op) {
auto [shape, strides] = collapse_contiguous_dims(
a.shape(), {a.strides(), b.strides(), c.strides(), out.strides()});
const auto& a_strides = strides[0];
const auto& b_strides = strides[1];
const auto& c_strides = strides[2];
const auto& out_strides = strides[3];
const T1* a_ptr = a.data<T1>();
const T2* b_ptr = b.data<T2>();
const T3* c_ptr = c.data<T3>();
U* out_ptr = out.data<U>();
int ndim = shape.size();
switch (ndim) {
case 1:
ternary_op_dims<T1, T2, T3, U, Op, 1>(
a_ptr,
b_ptr,
c_ptr,
out_ptr,
op,
shape,
a_strides,
b_strides,
c_strides,
out_strides,
0);
return;
case 2:
ternary_op_dims<T1, T2, T3, U, Op, 2>(
a_ptr,
b_ptr,
c_ptr,
out_ptr,
op,
shape,
a_strides,
b_strides,
c_strides,
out_strides,
0);
return;
}
ContiguousIterator a_it(shape, a_strides, ndim - 2);
ContiguousIterator b_it(shape, b_strides, ndim - 2);
ContiguousIterator c_it(shape, c_strides, ndim - 2);
auto stride = out_strides[ndim - 3];
for (size_t elem = 0; elem < a.size(); elem += stride) {
ternary_op_dims<T1, T2, T3, U, Op, 2>(
a_ptr + a_it.loc,
b_ptr + b_it.loc,
c_ptr + c_it.loc,
out_ptr + elem,
op,
shape,
a_strides,
b_strides,
c_strides,
out_strides,
ndim - 2);
a_it.step();
b_it.step();
c_it.step();
}
}
template <typename T1, typename T2, typename T3, typename U, typename Op>
void ternary_op(
const array& a,
const array& b,
const array& c,
array& out,
Op op) {
TernaryOpType topt = get_ternary_op_type(a, b, c);
set_ternary_op_output_data(a, b, c, out, topt);
// The full computation is scalar-scalar-scalar so we call the base op once.
if (topt == TernaryOpType::ScalarScalarScalar) {
*(out.data<U>()) = op(*a.data<T1>(), *b.data<T2>(), *c.data<T3>());
} else if (topt == TernaryOpType::VectorVectorVector) {
const T1* a_ptr = a.data<T1>();
const T2* b_ptr = b.data<T2>();
const T3* c_ptr = c.data<T3>();
U* out_ptr = out.data<U>();
for (size_t i = 0; i < out.size(); ++i) {
*out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
a_ptr++;
b_ptr++;
c_ptr++;
out_ptr++;
}
} else {
ternary_op_dispatch_dims<T1, T2, T3, U>(a, b, c, out, op);
}
}
} // namespace
} // namespace mlx::core
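
The VectorVectorVector fast path above is just an elementwise loop over three contiguous inputs. A standalone sketch (not the MLX API) applied to the usual ternary op, select(condition, a, b):

// Standalone sketch of the fully contiguous ternary fast path above.
#include <cstdio>
#include <vector>

int main() {
  std::vector<bool> cond = {true, false, true, false};
  std::vector<float> a = {1, 2, 3, 4}, b = {10, 20, 30, 40}, out(4);

  auto op = [](bool c, float x, float y) { return c ? x : y; };
  for (size_t i = 0; i < out.size(); ++i)
    out[i] = op(cond[i], a[i], b[i]); // elementwise; all inputs contiguous

  for (float v : out) std::printf("%.0f ", v); // 1 20 3 40
  std::printf("\n");
}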

View File

@@ -1,6 +1,6 @@
// Copyright © 2023 Apple Inc.
#include "mlx/backend/cpu/threefry.h"
#include "mlx/backend/common/threefry.h"
namespace mlx::core::random {

View File

@@ -5,11 +5,12 @@
#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/utils.h"
namespace mlx::core {
namespace {
void set_unary_output_data(const array& in, array& out) {
if (is_donatable(in, out)) {
out.copy_shared_buffer(in);
@@ -37,19 +38,8 @@ void unary_op(const array& a, array& out, Op op) {
if (a.flags().contiguous) {
set_unary_output_data(a, out);
U* dst = out.data<U>();
constexpr int N = simd::max_size<T>;
size_t size = a.data_size();
while (size >= N) {
simd::store(dst, op(simd::load<T, N>(a_ptr)));
size -= N;
a_ptr += N;
dst += N;
}
while (size > 0) {
*dst = op(*a_ptr);
size--;
dst++;
a_ptr++;
for (size_t i = 0; i < a.data_size(); ++i) {
dst[i] = op(a_ptr[i]);
}
} else {
out.set_data(allocator::malloc_or_wait(out.nbytes()));
@@ -135,4 +125,6 @@ void unary_fp(const array& a, array& out, Op op) {
}
}
} // namespace
} // namespace mlx::core
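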

View File

@@ -107,7 +107,7 @@ struct ContiguousIterator {
: shape_(a.shape()), strides_(a.strides()) {
if (!shape_.empty()) {
std::tie(shape_, strides_) = collapse_contiguous_dims(shape_, strides_);
pos_ = Shape(shape_.size(), 0);
pos_ = std::vector<int>(shape_.size(), 0);
}
}
@@ -168,10 +168,4 @@ void move_or_copy(
size_t data_size,
size_t offset = 0);
std::pair<bool, Strides> prepare_reshape(const array& in, const array& out);
void shared_buffer_reshape(
const array& in,
const Strides& out_strides,
array& out);
} // namespace mlx::core

View File

@@ -1,81 +0,0 @@
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(COMPILER ${CMAKE_C_COMPILER})
set(CLANG TRUE)
else()
set(COMPILER ${CMAKE_CXX_COMPILER})
endif()
set(COMPILE_DEPS
${PROJECT_SOURCE_DIR}/mlx/types/half_types.h
${PROJECT_SOURCE_DIR}/mlx/types/fp16.h
${PROJECT_SOURCE_DIR}/mlx/types/bf16.h
${PROJECT_SOURCE_DIR}/mlx/types/complex.h
simd/simd.h
simd/base_simd.h
simd/math.h
simd/type.h
unary_ops.h
binary_ops.h)
if(MSVC)
set(SHELL_EXT ps1)
set(SHELL_CMD powershell -ExecutionPolicy Bypass -File)
else()
set(SHELL_EXT sh)
set(SHELL_CMD bash)
endif()
add_custom_command(
OUTPUT compiled_preamble.cpp
COMMAND
${SHELL_CMD} ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.${SHELL_EXT}
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp ${COMPILER}
${PROJECT_SOURCE_DIR} ${CLANG} ${CMAKE_SYSTEM_PROCESSOR}
DEPENDS make_compiled_preamble.${SHELL_EXT} compiled_preamble.h
${COMPILE_DEPS})
add_custom_target(cpu_compiled_preamble DEPENDS compiled_preamble.cpp)
add_dependencies(mlx cpu_compiled_preamble)
target_sources(
mlx
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
${CMAKE_CURRENT_SOURCE_DIR}/gemms/cblas.cpp
${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp)
if(MLX_BUILD_ACCELERATE)
target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/bnns.cpp)
else()
target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/no_fp16.cpp
${CMAKE_CURRENT_SOURCE_DIR}/gemms/no_bf16.cpp)
endif()
if(IOS)
target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../no_cpu/compiled.cpp)
else()
target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
${CMAKE_CURRENT_SOURCE_DIR}/jit_compiler.cpp)
endif()

View File

@@ -1,370 +0,0 @@
// Copyright © 2023 Apple Inc.
#pragma once
#include <cassert>
#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/simd/simd.h"
namespace mlx::core {
template <typename Op>
struct VectorScalar {
Op op;
VectorScalar(Op op_) : op(op_) {}
template <typename T, typename U>
void operator()(const T* a, const T* b, U* dst, int size) {
T scalar = *b;
constexpr int N = simd::max_size<T>;
while (size >= N) {
simd::store(dst, op(simd::load<T, N>(a), simd::Simd<T, N>(scalar)));
dst += N;
a += N;
size -= N;
}
while (size-- > 0) {
*dst = op(*a, scalar);
dst++;
a++;
}
}
};
template <typename Op>
struct ScalarVector {
Op op;
ScalarVector(Op op_) : op(op_) {}
template <typename T, typename U>
void operator()(const T* a, const T* b, U* dst, int size) {
T scalar = *a;
constexpr int N = simd::max_size<T>;
while (size >= N) {
simd::store(dst, op(simd::Simd<T, N>(scalar), simd::load<T, N>(b)));
dst += N;
b += N;
size -= N;
}
while (size-- > 0) {
*dst = op(scalar, *b);
dst++;
b++;
}
}
};
template <typename Op>
struct VectorVector {
Op op;
VectorVector(Op op_) : op(op_) {}
template <typename T, typename U>
void operator()(const T* a, const T* b, U* dst, int size) {
constexpr int N = simd::max_size<T>;
while (size >= N) {
simd::store(dst, op(simd::load<T, N>(a), simd::load<T, N>(b)));
dst += N;
a += N;
b += N;
size -= N;
}
while (size-- > 0) {
*dst = op(*a, *b);
dst++;
a++;
b++;
}
}
};
template <typename T, typename U, typename Op, int D, bool Strided>
void binary_op_dims(
const T* a,
const T* b,
U* out,
Op op,
const Shape& shape,
const Strides& a_strides,
const Strides& b_strides,
const Strides& out_strides,
int axis) {
auto stride_a = a_strides[axis];
auto stride_b = b_strides[axis];
auto stride_out = out_strides[axis];
auto N = shape[axis];
for (int i = 0; i < N; i++) {
if constexpr (D > 1) {
binary_op_dims<T, U, Op, D - 1, Strided>(
a, b, out, op, shape, a_strides, b_strides, out_strides, axis + 1);
} else {
if constexpr (Strided) {
op(a, b, out, stride_out);
} else {
*out = op(*a, *b);
}
}
out += stride_out;
a += stride_a;
b += stride_b;
}
}
template <typename T, typename U, bool Strided, typename Op>
void binary_op_dispatch_dims(
const array& a,
const array& b,
array& out,
Op op,
int dim,
const Shape& shape,
const Strides& a_strides,
const Strides& b_strides,
const Strides& out_strides) {
const T* a_ptr = a.data<T>();
const T* b_ptr = b.data<T>();
U* out_ptr = out.data<U>();
switch (dim) {
case 1:
binary_op_dims<T, U, Op, 1, Strided>(
a_ptr,
b_ptr,
out_ptr,
op,
shape,
a_strides,
b_strides,
out_strides,
0);
return;
case 2:
binary_op_dims<T, U, Op, 2, Strided>(
a_ptr,
b_ptr,
out_ptr,
op,
shape,
a_strides,
b_strides,
out_strides,
0);
return;
case 3:
binary_op_dims<T, U, Op, 3, Strided>(
a_ptr,
b_ptr,
out_ptr,
op,
shape,
a_strides,
b_strides,
out_strides,
0);
return;
}
ContiguousIterator a_it(shape, a_strides, dim - 3);
ContiguousIterator b_it(shape, b_strides, dim - 3);
auto stride = out_strides[dim - 4];
for (int64_t elem = 0; elem < a.size(); elem += stride) {
binary_op_dims<T, U, Op, 3, Strided>(
a_ptr + a_it.loc,
b_ptr + b_it.loc,
out_ptr + elem,
op,
shape,
a_strides,
b_strides,
out_strides,
dim - 3);
a_it.step();
b_it.step();
}
}
template <typename T, typename U, typename Op>
void binary_op(const array& a, const array& b, array& out, Op op) {
auto bopt = get_binary_op_type(a, b);
set_binary_op_output_data(a, b, out, bopt);
// The full computation is scalar scalar so call the base op once
if (bopt == BinaryOpType::ScalarScalar) {
*(out.data<U>()) = op(*a.data<T>(), *b.data<T>());
return;
}
// The full computation is scalar vector so delegate to the op
if (bopt == BinaryOpType::ScalarVector) {
ScalarVector{op}(a.data<T>(), b.data<T>(), out.data<U>(), b.data_size());
return;
}
// The full computation is vector scalar so delegate to the op
if (bopt == BinaryOpType::VectorScalar) {
VectorScalar{op}(a.data<T>(), b.data<T>(), out.data<U>(), a.data_size());
return;
}
// The full computation is vector vector so delegate to the op
if (bopt == BinaryOpType::VectorVector) {
VectorVector{op}(a.data<T>(), b.data<T>(), out.data<U>(), out.size());
return;
}
// General computation so let's try to optimize
auto [new_shape, new_strides] = collapse_contiguous_dims(
a.shape(), {a.strides(), b.strides(), out.strides()});
const auto& a_strides = new_strides[0];
const auto& b_strides = new_strides[1];
const auto& strides = new_strides[2];
// Get the left-most dim such that the array is row contiguous after
auto leftmost_rc_dim = [&strides](const auto& arr_strides) {
int d = arr_strides.size() - 1;
for (; d >= 0 && arr_strides[d] == strides[d]; d--) {
}
return d + 1;
};
auto a_rc_dim = leftmost_rc_dim(a_strides);
auto b_rc_dim = leftmost_rc_dim(b_strides);
// Get the left-most dim such that the array is a broadcasted "scalar" after
auto leftmost_s_dim = [](const auto& arr_strides) {
int d = arr_strides.size() - 1;
for (; d >= 0 && arr_strides[d] == 0; d--) {
}
return d + 1;
};
auto a_s_dim = leftmost_s_dim(a_strides);
auto b_s_dim = leftmost_s_dim(b_strides);
auto ndim = new_shape.size();
// Case 1: LxM and FxM where L and F are broadcastable and M is row contiguous
int dim = ndim;
if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
bopt = BinaryOpType::VectorVector;
dim = d;
// Case 2: LxM and Fx1 where L and F are broadcastable and M is row
// contiguous
} else if (int d = std::max(a_rc_dim, b_s_dim); d < ndim) {
bopt = BinaryOpType::VectorScalar;
dim = d;
// Case 3: Lx1 and FxM where L and F are broadcastable and M is row
// contiguous
} else if (int d = std::max(a_s_dim, b_rc_dim); d < ndim) {
bopt = BinaryOpType::ScalarVector;
dim = d;
}
// We can be sure dim > 0 since otherwise we would have used one of the fully
// contiguous methods above, except when the flags do not correspond to the
// underlying contiguity.
if (dim == 0 || strides[dim - 1] < 16) {
bopt = BinaryOpType::General;
dim = ndim;
}
switch (bopt) {
case BinaryOpType::VectorVector:
binary_op_dispatch_dims<T, U, true>(
a,
b,
out,
VectorVector{op},
dim,
new_shape,
a_strides,
b_strides,
strides);
break;
case BinaryOpType::VectorScalar:
binary_op_dispatch_dims<T, U, true>(
a,
b,
out,
VectorScalar{op},
dim,
new_shape,
a_strides,
b_strides,
strides);
break;
case BinaryOpType::ScalarVector:
binary_op_dispatch_dims<T, U, true>(
a,
b,
out,
ScalarVector{op},
dim,
new_shape,
a_strides,
b_strides,
strides);
break;
default:
binary_op_dispatch_dims<T, U, false>(
a, b, out, op, dim, new_shape, a_strides, b_strides, strides);
break;
}
}
template <typename T, typename Op>
void binary_op(const array& a, const array& b, array& out, Op op) {
binary_op<T, T>(a, b, out, op);
}
template <typename Op>
void binary(const array& a, const array& b, array& out, Op op) {
switch (out.dtype()) {
case bool_:
binary_op<bool>(a, b, out, op);
break;
case uint8:
binary_op<uint8_t>(a, b, out, op);
break;
case uint16:
binary_op<uint16_t>(a, b, out, op);
break;
case uint32:
binary_op<uint32_t>(a, b, out, op);
break;
case uint64:
binary_op<uint64_t>(a, b, out, op);
break;
case int8:
binary_op<int8_t>(a, b, out, op);
break;
case int16:
binary_op<int16_t>(a, b, out, op);
break;
case int32:
binary_op<int32_t>(a, b, out, op);
break;
case int64:
binary_op<int64_t>(a, b, out, op);
break;
case float16:
binary_op<float16_t>(a, b, out, op);
break;
case float32:
binary_op<float>(a, b, out, op);
break;
case bfloat16:
binary_op<bfloat16_t>(a, b, out, op);
break;
case complex64:
binary_op<complex64_t>(a, b, out, op);
break;
}
}
} // namespace mlx::core
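
The Case 1/2/3 analysis above decides which inner kernel runs over the contiguous tail of the collapsed shape. A standalone sketch (not MLX code, shapes chosen for illustration) of the Case 1 situation: a of shape (L, M) against b of shape (1, M) broadcast over L, where b's row stride becomes 0 and every row is a contiguous vector-vector op of length M:

// Standalone sketch of the broadcast case analysis above for addition.
#include <cstdio>
#include <vector>

int main() {
  const int L = 3, M = 4;
  std::vector<float> a(L * M), b(M), out(L * M);
  for (int i = 0; i < L * M; ++i) a[i] = static_cast<float>(i);
  for (int j = 0; j < M; ++j) b[j] = 100.0f * j;

  long b_row_stride = 0; // broadcast dimension: stride 0 over L
  for (int i = 0; i < L; ++i) {
    const float* ap = a.data() + i * M;
    const float* bp = b.data() + i * b_row_stride; // always the same row of b
    for (int j = 0; j < M; ++j)                    // contiguous inner loop of length M
      out[i * M + j] = ap[j] + bp[j];
  }
  std::printf("%.0f %.0f ... %.0f\n", out[0], out[1], out[L * M - 1]); // 0 101 ... 311
}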

View File

@@ -1,98 +0,0 @@
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include "mlx/backend/cpu/simd/simd.h"
namespace mlx::core::detail {
using namespace mlx::core::simd;
#define BINARY_SINGLE() \
template <typename T> \
T operator()(T x, T y) { \
return (*this)(Simd<T, 1>(x), Simd<T, 1>(y)).value; \
}
#define DEFAULT_BINARY_OP(Op, op) \
struct Op { \
template <int N, typename T> \
Simd<T, N> operator()(Simd<T, N> x, Simd<T, N> y) { \
return op(x, y); \
} \
BINARY_SINGLE() \
};
DEFAULT_BINARY_OP(Add, operator+)
DEFAULT_BINARY_OP(ArcTan2, atan2)
DEFAULT_BINARY_OP(Divide, operator/)
DEFAULT_BINARY_OP(Multiply, operator*)
DEFAULT_BINARY_OP(Subtract, operator-)
DEFAULT_BINARY_OP(LogicalAnd, operator&&)
DEFAULT_BINARY_OP(LogicalOr, operator||)
DEFAULT_BINARY_OP(BitwiseAnd, operator&)
DEFAULT_BINARY_OP(BitwiseOr, operator|)
DEFAULT_BINARY_OP(BitwiseXor, operator^)
DEFAULT_BINARY_OP(LeftShift, operator<<)
DEFAULT_BINARY_OP(RightShift, operator>>)
DEFAULT_BINARY_OP(Remainder, remainder)
DEFAULT_BINARY_OP(Maximum, maximum)
DEFAULT_BINARY_OP(Minimum, minimum)
DEFAULT_BINARY_OP(Power, pow)
#define DEFAULT_BOOL_OP(Op, op) \
struct Op { \
template <int N, typename T> \
Simd<bool, N> operator()(Simd<T, N> x, Simd<T, N> y) { \
return op(x, y); \
} \
template <typename T> \
bool operator()(T x, T y) { \
return (*this)(Simd<T, 1>(x), Simd<T, 1>(y)).value; \
} \
};
DEFAULT_BOOL_OP(Equal, operator==)
DEFAULT_BOOL_OP(Greater, operator>)
DEFAULT_BOOL_OP(GreaterEqual, operator>=)
DEFAULT_BOOL_OP(Less, operator<)
DEFAULT_BOOL_OP(LessEqual, operator<=)
DEFAULT_BOOL_OP(NotEqual, operator!=)
struct NaNEqual {
template <int N, typename T>
Simd<bool, N> operator()(Simd<T, N> x, Simd<T, N> y) {
return x == y || (isnan(x) && isnan(y));
}
template <typename T>
bool operator()(T x, T y) {
return (*this)(Simd<T, 1>(x), Simd<T, 1>(y)).value;
}
};
struct LogAddExp {
template <int N, typename T>
Simd<T, N> operator()(Simd<T, N> x, Simd<T, N> y) {
auto maxval = maximum(x, y);
auto minval = minimum(x, y);
auto mask = minval == -inf || maxval == inf;
auto out = maxval + log1p(exp(minval - maxval));
return select(mask, Simd<T, N>(maxval), Simd<T, N>(out));
}
BINARY_SINGLE()
};
struct Select {
template <typename T>
T operator()(bool condition, T x, T y) {
return (*this)(Simd<bool, 1>(condition), Simd<T, 1>(x), Simd<T, 1>(y))
.value;
}
template <int N, typename T>
Simd<T, N> operator()(Simd<bool, N> condition, Simd<T, N> x, Simd<T, N> y) {
return select(condition, x, y);
}
};
} // namespace mlx::core::detail
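
The LogAddExp functor above evaluates log(exp(x) + exp(y)) in the numerically stable form max(x, y) + log1p(exp(min(x, y) - max(x, y))), which avoids overflow when either input is large. A scalar, standalone sketch of the same trick (not MLX code):

// Standalone sketch of the stable log-add-exp formula used above.
#include <algorithm>
#include <cmath>
#include <cstdio>

float logaddexp(float x, float y) {
  float maxval = std::max(x, y);
  float minval = std::min(x, y);
  if (std::isinf(maxval) || std::isinf(minval))
    return maxval; // infinities: the larger value dominates, avoids inf - inf
  return maxval + std::log1p(std::exp(minval - maxval));
}

int main() {
  std::printf("%.4f\n", logaddexp(0.0f, 0.0f));       // log(2) ~ 0.6931
  std::printf("%.1f\n", logaddexp(1000.0f, 1000.0f)); // ~1000.7; the naive form overflows
}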

Some files were not shown because too many files have changed in this diff.