Bump version (#1761)

use sdpa and exportable functions in transformer multi head attention (#1760)
Fix batched qmv bug (#1758)
2025-09-10 21:37:50 +08:00 · 2025-01-09 13:48:20 -08:00 · 2025-01-09 13:11:55 -08:00 · 2025-01-09 11:45:57 -08:00 · 2025-01-09 11:23:19 -08:00 · 2025-01-09 11:04:24 -08:00
248 changed files with 10243 additions and 5013 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -85,7 +85,7 @@ jobs:
          name: Install dependencies
          command: |
            pip install --upgrade cmake
-            pip install nanobind==2.2.0
+            pip install nanobind==2.4.0
            pip install numpy
            sudo apt-get update
            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
@@ -137,7 +137,7 @@ jobs:
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.2.0
+            pip install nanobind==2.4.0
            pip install numpy
            pip install torch
            pip install tensorflow
@@ -226,7 +226,7 @@ jobs:
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.2.0
+            pip install nanobind==2.4.0
            pip install --upgrade setuptools
            pip install numpy
            pip install twine
@@ -291,7 +291,7 @@ jobs:
            source env/bin/activate
            pip install --upgrade pip
            pip install --upgrade cmake
-            pip install nanobind==2.2.0
+            pip install nanobind==2.4.0
            pip install --upgrade setuptools
            pip install numpy
            pip install auditwheel
--- a/.gitignore
+++ b/.gitignore
@@ -76,6 +76,9 @@ build/
 *.out
 *.app

+# Debug symbols
+*.pdb
+
 # VSCode 
 .vscode/
 .DS_Store
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.24)
+cmake_minimum_required(VERSION 3.25)

 project(mlx LANGUAGES C CXX)

@@ -20,12 +20,14 @@ option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
 option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
 option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
 option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
+option(MLX_BUILD_BLAS_FROM_SOURCE "Build OpenBLAS from source code" OFF)
 option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
 option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)

 if(NOT MLX_VERSION)
-  set(MLX_VERSION 0.21.1)
+  set(MLX_VERSION 0.22.0)
 endif()
+add_compile_definitions("MLX_VERSION=${MLX_VERSION}")

 # --------------------- Processor tests -------------------------

@@ -93,8 +95,7 @@ elseif(MLX_BUILD_METAL)
  message(STATUS "Building with macOS SDK version ${MACOS_SDK_VERSION}")

  set(METAL_CPP_URL
-      https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18-beta.zip
-  )
+      https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18.zip)

  if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
    set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
@@ -113,16 +114,55 @@ elseif(MLX_BUILD_METAL)
  target_link_libraries(mlx PUBLIC ${METAL_LIB} ${FOUNDATION_LIB} ${QUARTZ_LIB})
 endif()

+if(WIN32)
+  if(MSVC)
+    # GGUF does not build with MSVC.
+    set(MLX_BUILD_GGUF OFF)
+    # There is no prebuilt OpenBLAS distribution for MSVC.
+    set(MLX_BUILD_BLAS_FROM_SOURCE ON)
+  endif()
+  # Windows implementation of dlfcn.h APIs.
+  FetchContent_Declare(
+    dlfcn-win32
+    GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
+    GIT_TAG v1.4.1
+    EXCLUDE_FROM_ALL)
+  block(PROPAGATE dlfcn-win32_SOURCE_DIR) # keep the source dir visible after endblock()
+  set(BUILD_SHARED_LIBS OFF)
+  FetchContent_MakeAvailable(dlfcn-win32)
+  endblock()
+  target_include_directories(mlx PRIVATE "${dlfcn-win32_SOURCE_DIR}/src")
+  target_link_libraries(mlx PRIVATE dl)
+endif()
+
 if(MLX_BUILD_CPU)
  find_library(ACCELERATE_LIBRARY Accelerate)
  if(ACCELERATE_LIBRARY)
    message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
    set(MLX_BUILD_ACCELERATE ON)
-    target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
-    add_compile_definitions(ACCELERATE_NEW_LAPACK)
  else()
    message(STATUS "Accelerate or arm neon not found, using default backend.")
    set(MLX_BUILD_ACCELERATE OFF)
+  endif()
+
+  if(MLX_BUILD_ACCELERATE)
+    target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
+    add_compile_definitions(ACCELERATE_NEW_LAPACK)
+  elseif(MLX_BUILD_BLAS_FROM_SOURCE)
+    # Download and build OpenBLAS from source code.
+    FetchContent_Declare(
+      openblas
+      GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git
+      GIT_TAG v0.3.28
+      EXCLUDE_FROM_ALL)
+    set(BUILD_STATIC_LIBS ON) # link statically
+    set(NOFORTRAN ON) # msvc has no fortran compiler
+    FetchContent_MakeAvailable(openblas)
+    target_link_libraries(mlx PRIVATE openblas)
+    target_include_directories(
+      mlx PRIVATE "${openblas_SOURCE_DIR}/lapack-netlib/LAPACKE/include"
+                  "${CMAKE_BINARY_DIR}/generated" "${CMAKE_BINARY_DIR}")
+  else()
    if(${CMAKE_HOST_APPLE})
      # The blas shipped in macOS SDK is not supported, search homebrew for
      # openblas instead.
@@ -140,7 +180,7 @@ if(MLX_BUILD_CPU)
    message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
    message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
    target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
-    target_link_libraries(mlx PUBLIC ${LAPACK_LIBRARIES})
+    target_link_libraries(mlx PRIVATE ${LAPACK_LIBRARIES})
    # List blas after lapack otherwise we may accidentally incldue an old
    # version of lapack.h from the include dirs of blas.
    find_package(BLAS REQUIRED)
@@ -153,14 +193,7 @@ if(MLX_BUILD_CPU)
    message(STATUS "Blas lib " ${BLAS_LIBRARIES})
    message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
    target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
-    target_link_libraries(mlx PUBLIC ${BLAS_LIBRARIES})
-
-    if(WIN32)
-      find_package(dlfcn-win32 REQUIRED)
-      message(STATUS "dlfcn-win32 lib " ${dlfcn-win32_LIBRARIES})
-      message(STATUS "dlfcn-win32 include " ${dlfcn-win32_INCLUDE_DIRS})
-      target_link_libraries(mlx PUBLIC ${dlfcn-win32_LIBRARIES})
-    endif()
+    target_link_libraries(mlx PRIVATE ${BLAS_LIBRARIES})
  endif()
 else()
  set(MLX_BUILD_ACCELERATE OFF)
@@ -207,8 +240,7 @@ if(MLX_BUILD_PYTHON_BINDINGS)
  execute_process(
    COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
    OUTPUT_STRIP_TRAILING_WHITESPACE
-    OUTPUT_VARIABLE NB_DIR)
-  list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
+    OUTPUT_VARIABLE nanobind_ROOT)
  find_package(nanobind CONFIG REQUIRED)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/python/src)
 endif()
--- a/benchmarks/cpp/autograd.cpp
+++ b/benchmarks/cpp/autograd.cpp
@@ -5,35 +5,35 @@
 #include "mlx/mlx.h"
 #include "time_utils.h"

-using namespace mlx::core;
+namespace mx = mlx::core;

 void time_value_and_grad() {
-  auto x = ones({200, 1000});
-  eval(x);
-  auto fn = [](array x) {
+  auto x = mx::ones({200, 1000});
+  mx::eval(x);
+  auto fn = [](mx::array x) {
    for (int i = 0; i < 20; ++i) {
-      x = log(exp(x));
+      x = mx::log(mx::exp(x));
    }
-    return sum(x);
+    return mx::sum(x);
  };

-  auto grad_fn = grad(fn);
+  auto grad_fn = mx::grad(fn);
  auto independent_value_and_grad = [&]() {
    auto value = fn(x);
    auto dfdx = grad_fn(x);
-    return std::vector<array>{value, dfdx};
+    return std::vector<mx::array>{value, dfdx};
  };
  TIME(independent_value_and_grad);

-  auto value_and_grad_fn = value_and_grad(fn);
+  auto value_and_grad_fn = mx::value_and_grad(fn);
  auto combined_value_and_grad = [&]() {
    auto [value, dfdx] = value_and_grad_fn(x);
-    return std::vector<array>{value, dfdx};
+    return std::vector<mx::array>{value, dfdx};
  };
  TIME(combined_value_and_grad);
 }

 int main() {
-  std::cout << "Benchmarks for " << default_device() << std::endl;
+  std::cout << "Benchmarks for " << mx::default_device() << std::endl;
  time_value_and_grad();
 }
--- a/benchmarks/cpp/compare_devices.cpp
+++ b/benchmarks/cpp/compare_devices.cpp
@@ -4,21 +4,21 @@
 #include "mlx/mlx.h"
 #include "time_utils.h"

-using namespace mlx::core;
+namespace mx = mlx::core;

 void time_add_op() {
  std::vector<int> sizes(1, 1);
  for (int i = 0; i < 9; ++i) {
    sizes.push_back(10 * sizes.back());
  }
-  set_default_device(Device::cpu);
+  mx::set_default_device(mx::Device::cpu);
  for (auto size : sizes) {
-    auto a = random::uniform({size});
-    auto b = random::uniform({size});
-    eval(a, b);
+    auto a = mx::random::uniform({size});
+    auto b = mx::random::uniform({size});
+    mx::eval(a, b);
    std::cout << "Size " << size << std::endl;
-    TIMEM("cpu", add, a, b, Device::cpu);
-    TIMEM("gpu", add, a, b, Device::gpu);
+    TIMEM("cpu", mx::add, a, b, mx::Device::cpu);
+    TIMEM("gpu", mx::add, a, b, mx::Device::gpu);
  }
 }

--- a/benchmarks/cpp/irregular_strides.cpp
+++ b/benchmarks/cpp/irregular_strides.cpp
@@ -6,105 +6,105 @@
 #include "mlx/mlx.h"
 #include "time_utils.h"

-using namespace mlx::core;
+namespace mx = mlx::core;

 void time_irregular_binary_ops_1D() {
-  auto device = default_device();
+  auto device = mx::default_device();
  int size = 1000000;
  int step = 2;
-  auto a = random::uniform({size});
-  auto b = random::uniform({size});
-  eval(a, b);
+  auto a = mx::random::uniform({size});
+  auto b = mx::random::uniform({size});
+  mx::eval(a, b);
  a = slice(a, {0}, {size}, {step});
  b = slice(b, {0}, {size}, {step});
-  TIMEM("1D strided", add, a, b, device);
+  TIMEM("1D strided", mx::add, a, b, device);
 }

 void time_irregular_binary_ops_2D() {
-  auto device = default_device();
+  auto device = mx::default_device();
  int size = 2048;
-  auto a = random::uniform({size, size});
-  auto b = random::uniform({size, size});
-  eval(a, b);
-  TIMEM("2D regular", add, a, b, device);
+  auto a = mx::random::uniform({size, size});
+  auto b = mx::random::uniform({size, size});
+  mx::eval(a, b);
+  TIMEM("2D regular", mx::add, a, b, device);

-  b = transpose(b);
-  eval(b);
-  TIMEM("2D transpose", add, a, b, device);
+  b = mx::transpose(b);
+  mx::eval(b);
+  TIMEM("2D transpose", mx::add, a, b, device);

-  b = random::uniform({size});
-  eval(b);
-  TIMEM("2D broadcast dim 0", add, a, b, device);
+  b = mx::random::uniform({size});
+  mx::eval(b);
+  TIMEM("2D broadcast dim 0", mx::add, a, b, device);

-  b = reshape(b, {size, 1});
-  eval(b);
-  TIMEM("2D broadcast dim 1", add, a, b, device);
+  b = mx::reshape(b, {size, 1});
+  mx::eval(b);
+  TIMEM("2D broadcast dim 1", mx::add, a, b, device);
 }

 void time_irregular_binary_ops_3D() {
-  auto device = default_device();
+  auto device = mx::default_device();
  int d0 = 32;
  int d1 = 512;
  int d2 = 512;
-  auto a = random::uniform({d0, d1, d2});
-  auto b = random::uniform({d0, d1, d2});
-  TIMEM("3D regular", add, a, b, device);
+  auto a = mx::random::uniform({d0, d1, d2});
+  auto b = mx::random::uniform({d0, d1, d2});
+  TIMEM("3D regular", mx::add, a, b, device);

-  b = transpose(b, {0, 2, 1});
-  TIMEM("3D transpose", add, a, b, device);
+  b = mx::transpose(b, {0, 2, 1});
+  TIMEM("3D transpose", mx::add, a, b, device);

-  b = random::uniform({d1, d2});
-  TIMEM("3D broadcast dim 0", add, a, b, device);
+  b = mx::random::uniform({d1, d2});
+  TIMEM("3D broadcast dim 0", mx::add, a, b, device);

-  b = random::uniform({d0, 1, d2});
-  TIMEM("3D broadcast dim 1", add, a, b, device);
+  b = mx::random::uniform({d0, 1, d2});
+  TIMEM("3D broadcast dim 1", mx::add, a, b, device);

-  b = random::uniform({d0, d1, 1});
-  TIMEM("3D broadcast dim 2", add, a, b, device);
+  b = mx::random::uniform({d0, d1, 1});
+  TIMEM("3D broadcast dim 2", mx::add, a, b, device);

-  b = random::uniform({d2});
-  TIMEM("3D broadcast dims 0, 1", add, a, b, device);
+  b = mx::random::uniform({d2});
+  TIMEM("3D broadcast dims 0, 1", mx::add, a, b, device);

-  b = random::uniform({d1, 1});
-  TIMEM("3D broadcast dims 0, 2", add, a, b, device);
+  b = mx::random::uniform({d1, 1});
+  TIMEM("3D broadcast dims 0, 2", mx::add, a, b, device);

-  b = random::uniform({d0, 1, 1});
-  TIMEM("3D broadcast dims 1, 2", add, a, b, device);
+  b = mx::random::uniform({d0, 1, 1});
+  TIMEM("3D broadcast dims 1, 2", mx::add, a, b, device);
 }

 void time_irregular_binary_ops_4D() {
-  auto device = default_device();
+  auto device = mx::default_device();
  std::vector<int> shape = {8, 8, 512, 512};
-  auto a = random::uniform(shape);
-  auto b = random::uniform(shape);
+  auto a = mx::random::uniform(shape);
+  auto b = mx::random::uniform(shape);

-  TIMEM("4D regular", add, a, b, device);
+  TIMEM("4D regular", mx::add, a, b, device);

-  b = transpose(b, {0, 1, 3, 2});
-  TIMEM("4D transpose", add, a, b, device);
+  b = mx::transpose(b, {0, 1, 3, 2});
+  TIMEM("4D transpose", mx::add, a, b, device);

  std::string om = "4D broadcast dims ";
  for (int i = 0; i < shape.size(); ++i) {
    shape[i] = 1;
-    b = random::uniform(shape);
+    b = mx::random::uniform(shape);
    std::ostringstream msg;
    msg << om << i;
-    TIMEM(msg.str(), add, a, b, device);
+    TIMEM(msg.str(), mx::add, a, b, device);

    for (int j = i + 1; j < shape.size(); ++j) {
      shape[j] = 1;
      std::ostringstream msg;
      msg << om << i << ", " << j;
-      b = random::uniform(shape);
-      TIMEM(msg.str(), add, a, b, device);
+      b = mx::random::uniform(shape);
+      TIMEM(msg.str(), mx::add, a, b, device);
      shape[j] = a.shape(j);

      for (int k = j + 1; k < shape.size(); ++k) {
        shape[k] = 1;
        std::ostringstream msg;
        msg << om << i << ", " << j << ", " << k;
-        b = random::uniform(shape);
-        TIMEM(msg.str(), add, a, b, device);
+        b = mx::random::uniform(shape);
+        TIMEM(msg.str(), mx::add, a, b, device);
        shape[k] = a.shape(k);
      }
    }
@@ -113,83 +113,83 @@ void time_irregular_binary_ops_4D() {
 }

 void time_irregular_reshape() {
-  auto device = default_device();
+  auto device = mx::default_device();
  std::vector<int> shape;
-  auto reshape_fn = [&shape, device](const array& a) {
-    return reshape(a, shape, device);
+  auto reshape_fn = [&shape, device](const mx::array& a) {
+    return mx::reshape(a, shape, device);
  };

  int size = 64;
  int d = 2 * size;

-  auto a = random::uniform({d, d, d});
+  auto a = mx::random::uniform({d, d, d});

  shape = {8 * size, size, size};
  TIMEM("3D contiguous", reshape_fn, a);

-  a = transpose(a);
+  a = mx::transpose(a);
  shape = {8 * size, size, size};
  TIMEM("3D transpose", reshape_fn, a);

-  a = transpose(a, {1, 2, 0});
+  a = mx::transpose(a, {1, 2, 0});
  shape = {8 * size, size, size};
  TIMEM("3D transpose dims 1 2", reshape_fn, a);

-  a = broadcast_to(random::uniform({d, d}), {d, d, d});
+  a = mx::broadcast_to(mx::random::uniform({d, d}), {d, d, d});
  TIMEM("3D broadcast dim 0", reshape_fn, a);

-  a = broadcast_to(random::uniform({d, 1, d}), {d, d, d});
+  a = mx::broadcast_to(mx::random::uniform({d, 1, d}), {d, d, d});
  TIMEM("3D broadcast dim 1", reshape_fn, a);

-  a = broadcast_to(random::uniform({d, d, 1}), {d, d, d});
+  a = mx::broadcast_to(mx::random::uniform({d, d, 1}), {d, d, d});
  TIMEM("3D broadcast dim 2", reshape_fn, a);

-  a = broadcast_to(random::uniform({d}), {d, d, d});
+  a = mx::broadcast_to(mx::random::uniform({d}), {d, d, d});
  TIMEM("3D broadcast dims 0, 1", reshape_fn, a);

-  a = broadcast_to(random::uniform({d, 1}), {d, d, d});
+  a = mx::broadcast_to(mx::random::uniform({d, 1}), {d, d, d});
  TIMEM("3D broadcast dims 0, 2", reshape_fn, a);

-  a = broadcast_to(random::uniform({d, 1, 1}), {d, d, d});
+  a = mx::broadcast_to(mx::random::uniform({d, 1, 1}), {d, d, d});
  TIMEM("3D broadcast dims 1, 2", reshape_fn, a);

-  a = broadcast_to(random::uniform({1, 1, 1}), {d, d, d});
+  a = mx::broadcast_to(mx::random::uniform({1, 1, 1}), {d, d, d});
  TIMEM("3D broadcast dims 1, 2, 3", reshape_fn, a);
 }

 void time_irregular_astype_1D() {
-  auto device = default_device();
+  auto device = mx::default_device();
  int size = 1000000;
  int step = 2;
-  auto a = random::uniform({size});
+  auto a = mx::random::uniform({size});
  a = slice(a, {0}, {size}, {step});
-  TIMEM("1D strided", astype, a, int32, device);
+  TIMEM("1D strided", mx::astype, a, mx::int32, device);
 }

 void time_irregular_astype_2D() {
-  auto device = default_device();
+  auto device = mx::default_device();
  int size = 2048;
  std::vector<int> shape = {size, size};

-  auto a = random::uniform(shape);
-  TIMEM("2D regular", astype, a, int32, device);
+  auto a = mx::random::uniform(shape);
+  TIMEM("2D regular", mx::astype, a, mx::int32, device);

-  a = transpose(a);
-  TIMEM("2D transpose", astype, a, int32, device);
+  a = mx::transpose(a);
+  TIMEM("2D transpose", mx::astype, a, mx::int32, device);

-  a = broadcast_to(random::uniform({size}), shape);
-  TIMEM("2D broadcast dim 0", astype, a, int32, device);
+  a = mx::broadcast_to(mx::random::uniform({size}), shape);
+  TIMEM("2D broadcast dim 0", mx::astype, a, mx::int32, device);

-  a = broadcast_to(random::uniform({size, 1}), shape);
-  TIMEM("2D broadcast dim 1", astype, a, int32, device);
+  a = mx::broadcast_to(mx::random::uniform({size, 1}), shape);
+  TIMEM("2D broadcast dim 1", mx::astype, a, mx::int32, device);
 }

 int main(int argc, char** argv) {
  if (argc > 1) {
    bool use_gpu = !strcmp(argv[1], "gpu");
-    set_default_device(use_gpu ? Device::gpu : Device::cpu);
+    mx::set_default_device(use_gpu ? mx::Device::gpu : mx::Device::cpu);
  }
-  std::cout << "Benchmarks for " << default_device() << std::endl;
+  std::cout << "Benchmarks for " << mx::default_device() << std::endl;
  time_irregular_binary_ops_1D();
  time_irregular_binary_ops_2D();
  time_irregular_binary_ops_3D();
--- a/benchmarks/cpp/single_ops.cpp
+++ b/benchmarks/cpp/single_ops.cpp
@@ -3,20 +3,20 @@
 #include "mlx/mlx.h"
 #include "time_utils.h"

-using namespace mlx::core;
+namespace mx = mlx::core;

 void time_creation_ops() {
  int M = 2000;
  int N = 500;
  auto shape = {M, N};
-  auto full_fp32 = [&]() { return full(shape, 3.3f); };
+  auto full_fp32 = [&]() { return mx::full(shape, 3.3f); };
  TIME(full_fp32);
-  auto zeros_fp32 = [&]() { return zeros(shape, float32); };
+  auto zeros_fp32 = [&]() { return mx::zeros(shape, mx::float32); };
  TIME(zeros_fp32);
-  auto ones_fp32 = [&]() { return ones(shape, float32); };
+  auto ones_fp32 = [&]() { return mx::ones(shape, mx::float32); };
  TIME(ones_fp32);

-  auto arange_fp32 = [&]() { return arange(0.0, 10.0, 1e-4); };
+  auto arange_fp32 = [&]() { return mx::arange(0.0, 10.0, 1e-4); };
  TIME(arange_fp32);
 }

@@ -24,194 +24,196 @@ void time_type_conversions() {
  int M = 2000;
  int N = 500;
  auto shape = {M, N};
-  auto device = default_device();
+  auto device = mx::default_device();

-  auto a = zeros(shape, float32);
-  eval(a);
-  TIMEM("float32 to int32", astype, a, int32, device);
-  TIMEM("float32 to uint32", astype, a, uint32, device);
+  auto a = mx::zeros(shape, mx::float32);
+  mx::eval(a);
+  TIMEM("float32 to int32", mx::astype, a, mx::int32, device);
+  TIMEM("float32 to uint32", mx::astype, a, mx::uint32, device);

-  a = zeros(shape, int32);
-  eval(a);
-  TIMEM("int32 to float32", astype, a, float32, device);
+  a = mx::zeros(shape, mx::int32);
+  mx::eval(a);
+  TIMEM("int32 to float32", mx::astype, a, mx::float32, device);

-  a = zeros(shape, bool_);
-  eval(a);
-  TIMEM("bool to float32", astype, a, float32, device);
-  TIMEM("bool to int32", astype, a, int32, device);
-  TIMEM("bool to uint32", astype, a, uint32, device);
+  a = mx::zeros(shape, mx::bool_);
+  mx::eval(a);
+  TIMEM("bool to float32", mx::astype, a, mx::float32, device);
+  TIMEM("bool to int32", mx::astype, a, mx::int32, device);
+  TIMEM("bool to uint32", mx::astype, a, mx::uint32, device);
 }

 void time_random_generation() {
  int M = 2000;
  int N = 500;

-  auto uniform = [&]() { return random::uniform({M, N}, float32); };
+  auto uniform = [&]() { return mx::random::uniform({M, N}, mx::float32); };
  TIME(uniform);
-  auto normal = [&]() { return random::normal({M, N}, float32); };
+  auto normal = [&]() { return mx::random::normal({M, N}, mx::float32); };
  TIME(normal);
 }

 void time_unary_ops() {
  int M = 2000;
  int N = 500;
-  auto device = default_device();
+  auto device = mx::default_device();

-  auto a = random::normal({M, N});
-  eval(a);
+  auto a = mx::random::normal({M, N});
+  mx::eval(a);
  TIME(mlx::core::abs, a, device);
-  TIME(negative, a, device);
-  TIME(sign, a, device);
-  TIME(square, a, device);
+  TIME(mx::negative, a, device);
+  TIME(mx::sign, a, device);
+  TIME(mx::square, a, device);
  TIME(mlx::core::sqrt, a, device);
-  TIME(rsqrt, a, device);
+  TIME(mx::rsqrt, a, device);
  TIME(mlx::core::exp, a, device);

-  a = random::uniform({M, N});
+  a = mx::random::uniform({M, N});
  TIME(mlx::core::log, a, device);
 }

 void time_binary_ops() {
  int M = 1000, N = 100, K = 10;
-  auto condition = random::randint(0, 2, {M, N, K});
-  auto a = random::uniform({M, N, K});
-  auto b = random::uniform({M, N, K});
-  auto device = default_device();
-  eval(a, b);
+  auto condition = mx::random::randint(0, 2, {M, N, K});
+  auto a = mx::random::uniform({M, N, K});
+  auto b = mx::random::uniform({M, N, K});
+  auto device = mx::default_device();
+  mx::eval(a, b);

-  TIME(add, a, b, device);
-  TIME(subtract, a, b, device);
-  TIME(multiply, a, b, device);
-  TIME(divide, a, b, device);
-  TIME(maximum, a, b, device);
-  TIME(minimum, a, b, device);
-  TIME(where, condition, a, b, device);
+  TIME(mx::add, a, b, device);
+  TIME(mx::subtract, a, b, device);
+  TIME(mx::multiply, a, b, device);
+  TIME(mx::divide, a, b, device);
+  TIME(mx::maximum, a, b, device);
+  TIME(mx::minimum, a, b, device);
+  TIME(mx::where, condition, a, b, device);

-  condition = array({true});
-  b = random::uniform({1});
-  eval(b);
-  TIMEM("scalar", add, a, b, device);
-  TIMEM("vector-scalar", subtract, a, b, device);
-  TIMEM("scalar-vector", subtract, b, a, device);
-  TIMEM("scalar", multiply, a, b, device);
-  TIMEM("vector-scalar", divide, a, b, device);
-  TIMEM("scalar-vector", divide, b, a, device);
-  TIMEM("scalar-vector", where, condition, a, b, device);
+  condition = mx::array({true});
+  b = mx::random::uniform({1});
+  mx::eval(b);
+  TIMEM("scalar", mx::add, a, b, device);
+  TIMEM("vector-scalar", mx::subtract, a, b, device);
+  TIMEM("scalar-vector", mx::subtract, b, a, device);
+  TIMEM("scalar", mx::multiply, a, b, device);
+  TIMEM("vector-scalar", mx::divide, a, b, device);
+  TIMEM("scalar-vector", mx::divide, b, a, device);
+  TIMEM("scalar-vector", mx::where, condition, a, b, device);

-  condition = broadcast_to(array({true}), {1000, 100});
-  a = broadcast_to(random::uniform({1}), {1000, 100});
-  b = broadcast_to(random::uniform({1}), {1000, 100});
-  eval(a, b);
-  TIMEM("scalar-scalar broadcast", add, a, b, device);
-  TIMEM("scalar-scalar broadcast", subtract, a, b, device);
-  TIMEM("scalar-scalar broadcast", multiply, a, b, device);
-  TIMEM("scalar-scalar broadcast", divide, a, b, device);
-  TIMEM("scalar-scalar broadcast", where, condition, a, b, device);
+  condition = mx::broadcast_to(mx::array({true}), {1000, 100});
+  a = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
+  b = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
+  mx::eval(a, b);
+  TIMEM("scalar-scalar broadcast", mx::add, a, b, device);
+  TIMEM("scalar-scalar broadcast", mx::subtract, a, b, device);
+  TIMEM("scalar-scalar broadcast", mx::multiply, a, b, device);
+  TIMEM("scalar-scalar broadcast", mx::divide, a, b, device);
+  TIMEM("scalar-scalar broadcast", mx::where, condition, a, b, device);
 }

 void time_strided_ops() {
  int M = 50, N = 50, O = 50, P = 50;
-  auto a = random::uniform({M, N, O, P});
-  auto b = random::uniform({M, N, O, P});
-  auto device = default_device();
-  eval(a, b);
-  TIMEM("non-strided", add, a, b, device);
-  a = transpose(a, {1, 0, 2, 3});
-  b = transpose(b, {3, 2, 0, 1});
-  eval(a, b);
-  TIMEM("strided", add, a, b, device);
+  auto a = mx::random::uniform({M, N, O, P});
+  auto b = mx::random::uniform({M, N, O, P});
+  auto device = mx::default_device();
+  mx::eval(a, b);
+  TIMEM("non-strided", mx::add, a, b, device);
+  a = mx::transpose(a, {1, 0, 2, 3});
+  b = mx::transpose(b, {3, 2, 0, 1});
+  mx::eval(a, b);
+  TIMEM("strided", mx::add, a, b, device);
 }

 void time_comparisons() {
  int M = 1000, N = 100, K = 10;
-  auto a = random::uniform({M, N, K});
-  auto b = random::uniform({M, N, K});
-  auto device = default_device();
-  eval(a, b);
-  TIME(equal, a, b, device);
-  TIME(greater, a, b, device);
-  TIME(greater_equal, a, b, device);
-  TIME(less, a, b, device);
-  TIME(less_equal, a, b, device);
+  auto a = mx::random::uniform({M, N, K});
+  auto b = mx::random::uniform({M, N, K});
+  auto device = mx::default_device();
+  mx::eval(a, b);
+  TIME(mx::equal, a, b, device);
+  TIME(mx::greater, a, b, device);
+  TIME(mx::greater_equal, a, b, device);
+  TIME(mx::less, a, b, device);
+  TIME(mx::less_equal, a, b, device);
 }

 void time_matvec() {
  int M = 2000, N = 200;
-  auto a = random::uniform({M, N});
-  auto b = random::uniform({N});
-  auto c = random::uniform({M});
-  eval(a, b, c);
-  auto matvec = [&]() { return matmul(a, b); };
+  auto a = mx::random::uniform({M, N});
+  auto b = mx::random::uniform({N});
+  auto c = mx::random::uniform({M});
+  mx::eval(a, b, c);
+  auto matvec = [&]() { return mx::matmul(a, b); };
  TIME(matvec);

-  auto matvec_transpose = [&]() { return matmul(transpose(a), c); };
+  auto matvec_transpose = [&]() { return mx::matmul(mx::transpose(a), c); };
  TIME(matvec_transpose);
 }

 void time_matmul() {
  int M = 1000, N = 1000, K = 1000;
-  auto a = random::uniform({M, K});
-  auto b = random::uniform({K, N});
-  auto device = default_device();
-  eval(a, b);
-  TIME(matmul, a, b, device);
+  auto a = mx::random::uniform({M, K});
+  auto b = mx::random::uniform({K, N});
+  auto device = mx::default_device();
+  mx::eval(a, b);
+  TIME(mx::matmul, a, b, device);

-  auto transpose_matmul = [&]() { return matmul(transpose(a), b); };
+  auto transpose_matmul = [&]() { return mx::matmul(mx::transpose(a), b); };
  TIME(transpose_matmul);
 }

 void time_reductions() {
-  auto a = random::normal({10000, 1000});
-  eval(a);
-  auto sum_all = [&a]() { return sum(a, false); };
+  auto a = mx::random::normal({10000, 1000});
+  mx::eval(a);
+  auto sum_all = [&a]() { return mx::sum(a, false); };
  TIME(sum_all);

-  auto sum_along_0 = [&a]() { return sum(a, 0, false); };
+  auto sum_along_0 = [&a]() { return mx::sum(a, 0, false); };
  TIME(sum_along_0);

-  auto sum_along_1 = [&a]() { return sum(a, 1, false); };
+  auto sum_along_1 = [&a]() { return mx::sum(a, 1, false); };
  TIME(sum_along_1);

-  auto prod_all = [&a]() { return prod(a, false); };
+  auto prod_all = [&a]() { return mx::prod(a, false); };
  TIME(prod_all);

-  auto all_true = [&a]() { return all(a, false); };
+  auto all_true = [&a]() { return mx::all(a, false); };
  TIME(all_true);

-  auto all_along_0 = [&a]() { return all(a, 0, false); };
+  auto all_along_0 = [&a]() { return mx::all(a, 0, false); };
  TIME(all_along_0);

-  auto all_along_1 = [&a]() { return all(a, 1, false); };
+  auto all_along_1 = [&a]() { return mx::all(a, 1, false); };
  TIME(all_along_1);

-  auto any_true = [&a]() { return any(a, false); };
+  auto any_true = [&a]() { return mx::any(a, false); };
  TIME(any_true);

-  auto argmin_along_0 = [&a]() { return argmin(a, 0, false); };
+  auto argmin_along_0 = [&a]() { return mx::argmin(a, 0, false); };
  TIME(argmin_along_0);

-  auto argmin_along_1 = [&a]() { return argmin(a, 1, false); };
+  auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
  TIME(argmin_along_1);
 }

 void time_gather_scatter() {
-  auto a = random::normal({1000, 768});
-  eval(a);
-  auto indices = random::randint(0, 1000, {256});
-  eval(indices);
+  auto a = mx::random::normal({1000, 768});
+  mx::eval(a);
+  auto indices = mx::random::randint(0, 1000, {256});
+  mx::eval(indices);

-  auto embedding_lookup = [&a, &indices]() { return take(a, indices, 0); };
+  auto embedding_lookup = [&a, &indices]() { return mx::take(a, indices, 0); };
  TIME(embedding_lookup);

-  indices = random::randint(0, 768 * 1000, {256 * 768});
-  eval(indices);
+  indices = mx::random::randint(0, 768 * 1000, {256 * 768});
+  mx::eval(indices);

-  auto single_element_lookup = [&a, &indices]() { return take(a, indices); };
+  auto single_element_lookup = [&a, &indices]() {
+    return mx::take(a, indices);
+  };
  TIME(single_element_lookup);

-  indices = random::randint(0, 1000, {256});
-  auto updates = random::normal({256, 1, 768});
-  eval(indices, updates);
+  indices = mx::random::randint(0, 1000, {256});
+  auto updates = mx::random::normal({256, 1, 768});
+  mx::eval(indices, updates);

  auto embedding_update = [&a, &indices, &updates]() {
    return scatter(a, indices, updates, 0);
@@ -223,10 +225,10 @@ void time_gather_scatter() {
  };
  TIME(embedding_add);

-  a = reshape(a, {-1});
-  indices = random::randint(0, 768 * 1000, {768 * 256});
-  updates = random::normal({256 * 768, 1});
-  eval(a, indices, updates);
+  a = mx::reshape(a, {-1});
+  indices = mx::random::randint(0, 768 * 1000, {768 * 256});
+  updates = mx::random::normal({256 * 768, 1});
+  mx::eval(a, indices, updates);

  auto single_element_update = [&a, &indices, &updates]() {
    return scatter(a, indices, updates, 0);
@@ -240,21 +242,21 @@ void time_gather_scatter() {
 }

 void time_divmod() {
-  auto a = random::normal({1000});
-  auto b = random::normal({1000});
-  eval({a, b});
+  auto a = mx::random::normal({1000});
+  auto b = mx::random::normal({1000});
+  mx::eval({a, b});

-  auto divmod_fused = [&a, &b]() { return divmod(a, b); };
+  auto divmod_fused = [&a, &b]() { return mx::divmod(a, b); };
  TIME(divmod_fused);

  auto divmod_separate = [&a, &b]() {
-    return std::vector<array>{floor_divide(a, b), remainder(a, b)};
+    return std::vector<mx::array>{mx::floor_divide(a, b), mx::remainder(a, b)};
  };
  TIME(divmod_separate);
 }

 int main() {
-  std::cout << "Benchmarks for " << default_device() << std::endl;
+  std::cout << "Benchmarks for " << mx::default_device() << std::endl;
  time_creation_ops();
  time_type_conversions();
  time_unary_ops();
--- a/benchmarks/python/sdpa_vector_bench.py
+++ b/benchmarks/python/sdpa_vector_bench.py
@@ -12,7 +12,7 @@ dtype = mx.float16
 loops = 10


-def attention(q, k, v):
+def attention(q, k, v, mask=None):
    def _sdpa(q, k, v):
        B, Hq, L, D = q.shape
        _, Hk, S, _ = k.shape
@@ -20,6 +20,9 @@ def attention(q, k, v):
        k = k[:, :, None, :, :]
        v = v[:, :, None, :, :]
        s = q @ k.transpose(0, 1, 2, 4, 3)
+        if mask is not None:
+            m = mx.broadcast_to(mask, (B, Hq, L, S)).reshape(B, Hk, Hq // Hk, L, S)
+            s = mx.where(m, s, mx.finfo(s.dtype).min)
        p = mx.softmax(s.astype(mx.float32), axis=-1).astype(s.dtype)
        o = p @ v
        return o.reshape(B, Hq, L, D)
@@ -29,9 +32,9 @@ def attention(q, k, v):
    return q


-def sdpa(q, k, v):
+def sdpa(q, k, v, mask=None):
    for i in range(loops):
-        q = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0)
+        q = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0, mask=mask)
    return q


@@ -53,6 +56,26 @@ def time_self_attention_sdpa():
    time_fn(sdpa, q, k, v)


+def time_self_attention_sdpa_with_mask():
+    mx.random.seed(3)
+    q = mx.random.uniform(shape=(1, H, 1, D)).astype(dtype)
+    k = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
+    v = mx.random.uniform(shape=(1, H_k, L, D)).astype(dtype)
+    mask = mx.full((L,), True)
+    mask[L // 2 :] = False
+    mx.eval(q, k, v, mask)
+
+    def sdpa_mask(*args):
+        return sdpa(*args, mask=mask)
+
+    def attention_mask(*args):
+        return attention(*args, mask=mask)
+
+    time_fn(attention_mask, q, k, v)
+    time_fn(sdpa_mask, q, k, v)
+
+
 if __name__ == "__main__":
    time_self_attention_sdpa()
    time_self_attention_primitives()
+    time_self_attention_sdpa_with_mask()
--- a/docs/src/dev/extensions.rst
+++ b/docs/src/dev/extensions.rst
@@ -420,8 +420,8 @@ element in the output.
            constant const float& alpha [[buffer(3)]],
            constant const float& beta [[buffer(4)]],
            constant const int* shape [[buffer(5)]],
-            constant const size_t* x_strides [[buffer(6)]],
-            constant const size_t* y_strides [[buffer(7)]],
+            constant const int64_t* x_strides [[buffer(6)]],
+            constant const int64_t* y_strides [[buffer(7)]],
            constant const int& ndim [[buffer(8)]],
            uint index [[thread_position_in_grid]]) {
        // Convert linear indices to offsets in array
@@ -438,24 +438,10 @@ each instantiation a unique host name so we can identify it.

 .. code-block:: C++

-    #define instantiate_axpby(type_name, type)              \
-        template [[host_name("axpby_general_" #type_name)]] \
-        [[kernel]] void axpby_general<type>(                \
-            device const type* x [[buffer(0)]],             \
-            device const type* y [[buffer(1)]],             \
-            device type* out [[buffer(2)]],                 \
-            constant const float& alpha [[buffer(3)]],      \
-            constant const float& beta [[buffer(4)]],       \
-            constant const int* shape [[buffer(5)]],        \
-            constant const size_t* x_strides [[buffer(6)]], \
-            constant const size_t* y_strides [[buffer(7)]], \
-            constant const int& ndim [[buffer(8)]],         \
-            uint index [[thread_position_in_grid]]);
-
-    instantiate_axpby(float32, float);
-    instantiate_axpby(float16, half);
-    instantiate_axpby(bfloat16, bfloat16_t);
-    instantiate_axpby(complex64, complex64_t);
+    instantiate_kernel("axpby_general_float32", axpby_general, float)
+    instantiate_kernel("axpby_general_float16", axpby_general, float16_t)
+    instantiate_kernel("axpby_general_bfloat16", axpby_general, bfloat16_t)
+    instantiate_kernel("axpby_general_complex64", axpby_general, complex64_t)

 The logic to determine the kernel, set the inputs, resolve the grid dimensions,
 and dispatch to the GPU are contained in :meth:`Axpby::eval_gpu` as shown
--- a/docs/src/dev/mlx_in_cpp.rst
+++ b/docs/src/dev/mlx_in_cpp.rst
@@ -0,0 +1,121 @@
+.. _mlx_in_cpp:
+
+Using MLX in C++
+================
+
+You can use MLX in a C++ project with CMake.
+
+.. note::
+
+  This guide is based on the following `example using MLX in C++ 
+  <https://github.com/ml-explore/mlx/tree/main/examples/cmake_project>`_
+
+First install MLX:
+
+.. code-block:: bash
+
+  pip install -U mlx
+
+You can also install the MLX Python package from source or just the C++
+library. For more information see the :ref:`documentation on installing MLX
+<build_and_install>`.
+
+Next make an example program in ``example.cpp``: 
+
+.. code-block:: C++
+
+  #include <iostream>
+
+  #include "mlx/mlx.h"
+
+  namespace mx = mlx::core;
+
+  int main() {
+    auto x = mx::array({1, 2, 3});
+    auto y = mx::array({1, 2, 3});
+    std::cout << x + y << std::endl;
+    return 0;
+  }
+
+The next step is to set up a CMake file in ``CMakeLists.txt``:
+
+.. code-block:: cmake
+
+  cmake_minimum_required(VERSION 3.27)
+
+  project(example LANGUAGES CXX)
+
+  set(CMAKE_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
+Depending on how you installed MLX, you may need to tell CMake where to
+find it. 
+
+If you installed MLX with Python, then add the following to the CMake file:
+
+.. code-block:: cmake
+
+  find_package(
+    Python 3.9
+    COMPONENTS Interpreter Development.Module
+    REQUIRED)
+  execute_process(
+    COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE MLX_ROOT)
+
+If you installed the MLX C++ package to a system path, then CMake should be
+able to find it. If you installed it to a non-standard location or CMake can't
+find MLX then set ``MLX_ROOT`` to the location where MLX is installed:
+
+.. code-block:: cmake
+
+  set(MLX_ROOT "/path/to/mlx/")
+
+Next, instruct CMake to find MLX:
+
+.. code-block:: cmake
+
+  find_package(MLX CONFIG REQUIRED)
+
+Finally, add the ``example.cpp`` program as an executable and link MLX.
+
+.. code-block:: cmake
+
+  add_executable(example example.cpp)
+  target_link_libraries(example PRIVATE mlx)
+
+You can build the example with:
+
+.. code-block:: bash
+
+  cmake -B build -DCMAKE_BUILD_TYPE=Release
+  cmake --build build
+
+And run it with:
+
+.. code-block:: bash
+
+  ./build/example
+
+Note ``find_package(MLX CONFIG REQUIRED)`` sets the following variables:
+
+.. list-table:: Package Variables
+   :widths: 20 20 
+   :header-rows: 1
+
+   * - Variable 
+     - Description 
+   * - MLX_FOUND
+     - ``True`` if MLX is found
+   * - MLX_INCLUDE_DIRS
+     - Include directory
+   * - MLX_LIBRARIES
+     - Libraries to link against
+   * - MLX_CXX_FLAGS
+     - Additional compiler flags
+   * - MLX_BUILD_ACCELERATE
+     - ``True`` if MLX was built with Accelerate 
+   * - MLX_BUILD_METAL
+     - ``True`` if MLX was built with Metal
--- a/docs/src/index.rst
+++ b/docs/src/index.rst
@@ -45,6 +45,7 @@ are the CPU and GPU.
   usage/numpy
   usage/distributed
   usage/using_streams
+   usage/export

 .. toctree::
   :caption: Examples
@@ -61,6 +62,7 @@ are the CPU and GPU.
   python/array
   python/data_types
   python/devices_and_streams
+   python/export
   python/ops
   python/random
   python/transforms
@@ -86,3 +88,4 @@ are the CPU and GPU.
   dev/extensions
   dev/metal_debugger
   dev/custom_metal_kernels
+   dev/mlx_in_cpp
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -1,3 +1,5 @@
+.. _build_and_install:
+
 Build and Install
 =================

@@ -53,7 +55,7 @@ Build Requirements
 ^^^^^^^^^^^^^^^^^^

 - A C++ compiler with C++17 support (e.g. Clang >= 5.0)
- `cmake <https://cmake.org/>`_ -- version 3.24 or later, and ``make``
+- `cmake <https://cmake.org/>`_ -- version 3.25 or later, and ``make``
 - Xcode >= 15.0 and macOS SDK >= 14.0

 .. note::
--- a/docs/src/python/data_types.rst
+++ b/docs/src/python/data_types.rst
@@ -66,3 +66,4 @@ documentation for more information. Use :func:`issubdtype` to determine if one
   Dtype
   DtypeCategory
   issubdtype
+   finfo
--- a/docs/src/python/export.rst
+++ b/docs/src/python/export.rst
@@ -0,0 +1,14 @@
+.. _export:
+
+Export Functions
+================
+
+.. currentmodule:: mlx.core
+
+.. autosummary::
+  :toctree: _autosummary
+
+   export_function
+   import_function
+   exporter
+   export_to_dot
--- a/docs/src/python/ops.rst
+++ b/docs/src/python/ops.rst
@@ -89,6 +89,7 @@ Operations
   isneginf
   isposinf
   issubdtype
+   kron
   left_shift
   less
   less_equal
@@ -144,6 +145,8 @@ Operations
   sign
   sin
   sinh
+   slice
+   slice_update
   softmax
   sort
   split
@@ -168,6 +171,7 @@ Operations
   tri
   tril
   triu
+   unflatten
   var
   view
   where
--- a/docs/src/usage/compile.rst
+++ b/docs/src/usage/compile.rst
@@ -421,3 +421,77 @@ the most opportunity to optimize the computation graph:
  # Compiling the outer function is good to do as it will likely
  # be faster even though the inner functions are compiled
  fun = mx.compile(outer)
+
+
+
+.. _shapeless_compile:
+
+Shapeless Compilation
+---------------------
+
+When the shape of an input to a compiled function changes, the function is
+recompiled. You can compile a function once and run it on inputs with
+variable shapes by specifying ``shapeless=True`` to :func:`compile`. In this
+case changes to the shapes of the inputs do not cause the function to be
+recompiled.
+
+.. code-block:: python
+
+  def fun(x, y):
+      return mx.abs(x + y)
+
+  compiled_fun = mx.compile(fun, shapeless=True)
+
+  x = mx.array(1.0)
+  y = mx.array(-2.0)
+
+  # First call compiles the function
+  print(compiled_fun(x, y))
+
+  # Second call with different shapes
+  # does not recompile the function
+  x = mx.array([1.0, -6.0])
+  y = mx.array([-2.0, 3.0])
+  print(compiled_fun(x, y))
+
+
+Use shapeless compilations carefully. Since compilation is not triggered when
+shapes change, any graphs which are conditional on the input shapes will not
+work as expected. Shape-dependent computations are common and sometimes subtle
+to detect. For example:
+
+.. code-block:: python
+
+  def fun(x):
+      return x.reshape(x.shape[0] * x.shape[1], -1)
+
+  compiled_fun = mx.compile(fun, shapeless=True)
+
+  x = mx.random.uniform(shape=(2, 3, 4))
+
+  out = compiled_fun(x)
+
+  x = mx.random.uniform(shape=(5, 5, 3))
+
+  # Error, can't reshape (5, 5, 3) to (6, -1)
+  out = compiled_fun(x)
+
+The second call to the ``compiled_fun`` fails because of the call to
+:func:`reshape` which uses the static shape of ``x`` in the first call. We can
+fix this by using :func:`flatten` to avoid hardcoding the shape of ``x``:
+
+.. code-block:: python
+
+  def fun(x):
+      return x.flatten(0, 1)
+
+  compiled_fun = mx.compile(fun, shapeless=True)
+
+  x = mx.random.uniform(shape=(2, 3, 4))
+
+  out = compiled_fun(x)
+
+  x = mx.random.uniform(shape=(5, 5, 3))
+
+  # Ok
+  out = compiled_fun(x)
--- a/docs/src/usage/distributed.rst
+++ b/docs/src/usage/distributed.rst
@@ -141,12 +141,13 @@ everything else remaining the same.
    from mlx.utils import tree_map

    def all_reduce_grads(grads):
-        N = mx.distributed.init()
+        N = mx.distributed.init().size()
        if N == 1:
            return grads
        return tree_map(
-                lambda x: mx.distributed.all_sum(x) / N,
-                grads)
+            lambda x: mx.distributed.all_sum(x) / N,
+            grads
+        )

    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
--- a/docs/src/usage/export.rst
+++ b/docs/src/usage/export.rst
@@ -0,0 +1,288 @@
+.. _export_usage:
+
+Exporting Functions
+===================
+
+.. currentmodule:: mlx.core
+
+MLX has an API to export and import functions to and from a file. This lets you
+run computations written in one MLX front-end (e.g. Python) in another MLX
+front-end (e.g. C++). 
+
+This guide walks through the basics of the MLX export API with some examples.
+To see the full list of functions check out the :ref:`API documentation
+<export>`.
+
+Basics of Exporting 
+-------------------
+
+Let's start with a simple example:
+ 
+.. code-block:: python
+
+  def fun(x, y):
+    return x + y
+
+  x = mx.array(1.0)
+  y = mx.array(1.0)
+  mx.export_function("add.mlxfn", fun, x, y)
+
+To export a function, provide sample input arrays that the function
+can be called with. The data doesn't matter, but the shapes and types of the
+arrays do. In the above example we exported ``fun`` with two ``float32``
+scalar arrays. We can then import the function and run it:
+
+.. code-block:: python
+
+  add_fun = mx.import_function("add.mlxfn")
+
+  out, = add_fun(mx.array(1.0), mx.array(2.0))
+  # Prints: array(3, dtype=float32)
+  print(out)
+
+  out, = add_fun(mx.array(1.0), mx.array(3.0))
+  # Prints: array(4, dtype=float32)
+  print(out)
+
+  # Raises an exception
+  add_fun(mx.array(1), mx.array(3.0))
+
+  # Raises an exception
+  add_fun(mx.array([1.0, 2.0]), mx.array(3.0))
+
+Notice the third and fourth calls to ``add_fun`` raise exceptions because the
+shapes and types of the inputs are different than the shapes and types of the
+example inputs we exported the function with.
+
+Also notice that even though the original ``fun`` returns a single output
+array, the imported function always returns a tuple of one or more arrays.
+
+The inputs to :func:`export_function` and to an imported function can be
+specified as variable positional arguments or as a tuple of arrays:
+
+.. code-block:: python
+
+  def fun(x, y):
+    return x + y
+
+  x = mx.array(1.0)
+  y = mx.array(1.0)
+   
+  # Both arguments to fun are positional
+  mx.export_function("add.mlxfn", fun, x, y)
+
+  # Same as above
+  mx.export_function("add.mlxfn", fun, (x, y))
+
+  imported_fun = mx.import_function("add.mlxfn")
+
+  # Ok
+  out, = imported_fun(x, y)
+
+  # Also ok
+  out, = imported_fun((x, y))
+
+You can pass example inputs to functions as positional or keyword arguments. If
+you use keyword arguments to export the function, then you have to use the same
+keyword arguments when calling the imported function.
+
+.. code-block:: python
+
+  def fun(x, y):
+    return x + y
+
+  # One argument to fun is positional, the other is a kwarg
+  mx.export_function("add.mlxfn", fun, x, y=y)
+
+  imported_fun = mx.import_function("add.mlxfn")
+
+  # Ok
+  out, = imported_fun(x, y=y)
+
+  # Also ok
+  out, = imported_fun((x,), {"y": y})
+
+  # Raises since the keyword argument is missing
+  out, = imported_fun(x, y)
+
+  # Raises since the keyword argument has the wrong key
+  out, = imported_fun(x, z=y)
+
+
+Exporting Modules
+-----------------
+
+An :obj:`mlx.nn.Module` can be exported with or without the parameters included
+in the exported function. Here's an example:
+
+.. code-block:: python
+
+   model = nn.Linear(4, 4)
+   mx.eval(model.parameters())
+
+   def call(x):
+      return model(x)
+
+   mx.export_function("model.mlxfn", call, mx.zeros(4))
+
+In the above example, the :obj:`mlx.nn.Linear` module is exported. Its
+parameters are also saved to the ``model.mlxfn`` file.
+
+.. note::
+
+   For enclosed arrays inside an exported function, be extra careful to ensure
+   they are evaluated. The computation graph that gets exported will include
+   the computation that produces enclosed inputs.
+  
+   If the above example was missing ``mx.eval(model.parameters())``, the
+   exported function would include the random initialization of the
+   :obj:`mlx.nn.Module` parameters.
+
+If you only want to export the ``Module.__call__`` function without the
+parameters, pass them as inputs to the ``call`` wrapper:
+
+.. code-block:: python
+
+   model = nn.Linear(4, 4)
+   mx.eval(model.parameters())
+
+   def call(x, **params):
+     # Set the model's parameters to the input parameters
+     model.update(tree_unflatten(list(params.items())))
+     return model(x)
+ 
+   params = dict(tree_flatten(model.parameters()))
+   mx.export_function("model.mlxfn", call, (mx.zeros(4),), params)
+
+
+Shapeless Exports
+-----------------
+
+Just like :func:`compile`, functions can also be exported for dynamically shaped
+inputs. Pass ``shapeless=True`` to :func:`export_function` or :func:`exporter`
+to export a function which can be used for inputs with variable shapes:
+
+.. code-block:: python
+
+  mx.export_function("fun.mlxfn", mx.abs, mx.array(0.0), shapeless=True)
+  imported_abs = mx.import_function("fun.mlxfn")
+
+  # Ok
+  out, = imported_abs(mx.array(-1.0))
+  
+  # Also ok 
+  out, = imported_abs(mx.array([-1.0, -2.0]))
+
+With ``shapeless=False`` (which is the default), the second call to
+``imported_abs`` would raise an exception with a shape mismatch.
+
+Shapeless exporting works the same as shapeless compilation and should be
+used carefully. See the :ref:`documentation on shapeless compilation
+<shapeless_compile>` for more information.
+
+Exporting Multiple Traces
+-------------------------
+
+In some cases, functions build different computation graphs for different
+input arguments. A simple way to manage this is to export to a new file with
+each set of inputs. This is a fine option in many cases. But it can be
+suboptimal if the exported functions have a large amount of duplicate constant
+data (for example the parameters of a :obj:`mlx.nn.Module`).
+
+The export API in MLX lets you export multiple traces of the same function to
+a single file by creating an exporting context manager with :func:`exporter`:
+
+.. code-block:: python
+
+  def fun(x, y=None):
+      constant = mx.array(3.0)
+      if y is not None:
+        x += y 
+      return x + constant
+
+  with mx.exporter("fun.mlxfn", fun) as exporter:
+      exporter(mx.array(1.0))
+      exporter(mx.array(1.0), y=mx.array(0.0))
+
+  imported_function = mx.import_function("fun.mlxfn")
+
+  # Call the function with y=None
+  out, = imported_function(mx.array(1.0))
+  print(out)
+
+  # Call the function with y specified
+  out, = imported_function(mx.array(1.0), y=mx.array(1.0))
+  print(out)
+
+In the above example the function's constant data (i.e. ``constant``) is only
+saved once.
+
+Transformations with Imported Functions
+---------------------------------------
+
+Function transformations like :func:`grad`, :func:`vmap`, and :func:`compile` work
+on imported functions just like regular Python functions:
+
+.. code-block:: python
+
+  def fun(x):
+      return mx.sin(x)
+
+  x = mx.array(0.0)
+  mx.export_function("sine.mlxfn", fun, x)
+
+  imported_fun = mx.import_function("sine.mlxfn")
+
+  # Take the derivative of the imported function
+  dfdx = mx.grad(lambda x: imported_fun(x)[0])
+  # Prints: array(1, dtype=float32)
+  print(dfdx(x))
+
+  # Compile the imported function 
+  compiled_fun = mx.compile(imported_fun)
+  # Prints: array(0, dtype=float32)
+  print(compiled_fun(x)[0])
+
+
+Importing Functions in C++
+--------------------------
+
+Importing and running functions in C++ is basically the same as importing and
+running them in Python. First, follow the :ref:`instructions <mlx_in_cpp>` to
+set up a simple C++ project that uses MLX as a library.
+
+Next, export a simple function from Python:
+
+.. code-block:: python
+
+  def fun(x, y):
+      return mx.exp(x + y)
+
+  x = mx.array(1.0)
+  y = mx.array(1.0)
+  mx.export_function("fun.mlxfn", fun, x, y)
+
+
+Import and run the function in C++ with only a few lines of code:
+
+.. code-block:: c++
+
+  auto fun = mx::import_function("fun.mlxfn");
+
+  auto inputs = {mx::array(1.0), mx::array(1.0)};
+  auto outputs = fun(inputs);
+
+  // Prints: array(7.38906, dtype=float32)
+  std::cout << outputs[0] << std::endl;
+
+Imported functions can be transformed in C++ just like in Python. Use 
+``std::vector<mx::array>`` for positional arguments and ``std::map<std::string,
+mx::array>`` for keyword arguments when calling imported functions in C++.
+
+More Examples
+-------------
+
+Here are a few more complete examples exporting more complex functions from
+Python and importing and running them in C++:
+
+* `Inference and training a multi-layer perceptron <https://github.com/ml-explore/mlx/tree/main/examples/export>`_
--- a/examples/cmake_project/CMakeLists.txt
+++ b/examples/cmake_project/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.27)
+
+project(example LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Comment out the following two commands if only the MLX C++ library is
+# installed, and set(MLX_ROOT "/path/to/mlx") directly if needed.
+find_package(
+  Python 3.9
+  COMPONENTS Interpreter Development.Module
+  REQUIRED)
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE MLX_ROOT)
+
+find_package(MLX CONFIG REQUIRED)
+
+add_executable(example example.cpp)
+target_link_libraries(example PRIVATE mlx)
--- a/examples/cmake_project/README.md
+++ b/examples/cmake_project/README.md
@@ -0,0 +1,26 @@
+## Build and Run 
+
+Install MLX with Python:
+
+```bash
+pip install "mlx>=0.22"
+```
+
+Build the C++ example:
+
+```bash
+cmake -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build
+```
+
+Run the C++ example:
+
+```
+./build/example
+```
+
+which should output:
+
+```
+array([2, 4, 6], dtype=int32)
+```
--- a/examples/cmake_project/example.cpp
+++ b/examples/cmake_project/example.cpp
@@ -0,0 +1,14 @@
+// Copyright © 2024 Apple Inc.
+
+#include <iostream>
+
+#include "mlx/mlx.h"
+
+namespace mx = mlx::core;
+
+int main() {
+  auto x = mx::array({1, 2, 3});
+  auto y = mx::array({1, 2, 3});
+  std::cout << x + y << std::endl;
+  return 0;
+}
--- a/examples/cpp/distributed.cpp
+++ b/examples/cpp/distributed.cpp
@@ -4,19 +4,19 @@

 #include "mlx/mlx.h"

-using namespace mlx::core;
+namespace mx = mlx::core;

 int main() {
-  if (!distributed::is_available()) {
+  if (!mx::distributed::is_available()) {
    std::cout << "No communication backend found" << std::endl;
    return 1;
  }

-  auto global_group = distributed::init();
+  auto global_group = mx::distributed::init();
  std::cout << global_group.rank() << " / " << global_group.size() << std::endl;

-  array x = ones({10});
-  array out = distributed::all_sum(x, global_group);
+  mx::array x = mx::ones({10});
+  mx::array out = mx::distributed::all_sum(x, global_group);

  std::cout << out << std::endl;
 }
--- a/examples/cpp/linear_regression.cpp
+++ b/examples/cpp/linear_regression.cpp
@@ -10,7 +10,7 @@
 /**
 * An example of linear regression with MLX.
 */
-using namespace mlx::core;
+namespace mx = mlx::core;

 int main() {
  int num_features = 100;
@@ -19,35 +19,35 @@ int main() {
  float learning_rate = 0.01;

  // True parameters
-  auto w_star = random::normal({num_features});
+  auto w_star = mx::random::normal({num_features});

  // The input examples (design matrix)
-  auto X = random::normal({num_examples, num_features});
+  auto X = mx::random::normal({num_examples, num_features});

  // Noisy labels
-  auto eps = 1e-2 * random::normal({num_examples});
-  auto y = matmul(X, w_star) + eps;
+  auto eps = 1e-2 * mx::random::normal({num_examples});
+  auto y = mx::matmul(X, w_star) + eps;

  // Initialize random parameters
-  array w = 1e-2 * random::normal({num_features});
+  mx::array w = 1e-2 * mx::random::normal({num_features});

-  auto loss_fn = [&](array w) {
-    auto yhat = matmul(X, w);
-    return (0.5f / num_examples) * sum(square(yhat - y));
+  auto loss_fn = [&](mx::array w) {
+    auto yhat = mx::matmul(X, w);
+    return (0.5f / num_examples) * mx::sum(mx::square(yhat - y));
  };

-  auto grad_fn = grad(loss_fn);
+  auto grad_fn = mx::grad(loss_fn);

  auto tic = timer::time();
  for (int it = 0; it < num_iters; ++it) {
-    auto grad = grad_fn(w);
-    w = w - learning_rate * grad;
-    eval(w);
+    auto grads = grad_fn(w);
+    w = w - learning_rate * grads;
+    mx::eval(w);
  }
  auto toc = timer::time();

  auto loss = loss_fn(w);
-  auto error_norm = std::sqrt(sum(square(w - w_star)).item<float>());
+  auto error_norm = std::sqrt(mx::sum(mx::square(w - w_star)).item<float>());
  auto throughput = num_iters / timer::seconds(toc - tic);
  std::cout << "Loss " << loss << ", |w - w*| = " << error_norm
            << ", Throughput " << throughput << " (it/s)." << std::endl;
--- a/examples/cpp/logistic_regression.cpp
+++ b/examples/cpp/logistic_regression.cpp
@@ -10,7 +10,7 @@
 /**
 * An example of logistic regression with MLX.
 */
-using namespace mlx::core;
+namespace mx = mlx::core;

 int main() {
  int num_features = 100;
@@ -19,35 +19,35 @@ int main() {
  float learning_rate = 0.1;

  // True parameters
-  auto w_star = random::normal({num_features});
+  auto w_star = mx::random::normal({num_features});

  // The input examples
-  auto X = random::normal({num_examples, num_features});
+  auto X = mx::random::normal({num_examples, num_features});

  // Labels
-  auto y = matmul(X, w_star) > 0;
+  auto y = mx::matmul(X, w_star) > 0;

  // Initialize random parameters
-  array w = 1e-2 * random::normal({num_features});
+  mx::array w = 1e-2 * mx::random::normal({num_features});

-  auto loss_fn = [&](array w) {
-    auto logits = matmul(X, w);
+  auto loss_fn = [&](mx::array w) {
+    auto logits = mx::matmul(X, w);
    auto scale = (1.0f / num_examples);
-    return scale * sum(logaddexp(array(0.0f), logits) - y * logits);
+    return scale * mx::sum(mx::logaddexp(mx::array(0.0f), logits) - y * logits);
  };

-  auto grad_fn = grad(loss_fn);
+  auto grad_fn = mx::grad(loss_fn);

  auto tic = timer::time();
  for (int it = 0; it < num_iters; ++it) {
-    auto grad = grad_fn(w);
-    w = w - learning_rate * grad;
-    eval(w);
+    auto grads = grad_fn(w);
+    w = w - learning_rate * grads;
+    mx::eval(w);
  }
  auto toc = timer::time();

  auto loss = loss_fn(w);
-  auto acc = sum((matmul(X, w) > 0) == y) / num_examples;
+  auto acc = mx::sum((mx::matmul(X, w) > 0) == y) / num_examples;
  auto throughput = num_iters / timer::seconds(toc - tic);
  std::cout << "Loss " << loss << ", Accuracy, " << acc << ", Throughput "
            << throughput << " (it/s)." << std::endl;
--- a/examples/cpp/metal_capture.cpp
+++ b/examples/cpp/metal_capture.cpp
@@ -5,27 +5,27 @@

 #include "mlx/mlx.h"

-using namespace mlx::core;
+namespace mx = mlx::core;

 int main() {
  // To use Metal debugging and profiling:
  // 1. Build with the MLX_METAL_DEBUG CMake option (i.e. -DMLX_METAL_DEBUG=ON).
  // 2. Run with MTL_CAPTURE_ENABLED=1.
-  metal::start_capture("mlx_trace.gputrace");
+  mx::metal::start_capture("mlx_trace.gputrace");

  // Start at index two because the default GPU and CPU streams have indices
  // zero and one, respectively. This naming matches the label assigned to each
  // stream's command queue.
-  auto s2 = new_stream(Device::gpu);
-  auto s3 = new_stream(Device::gpu);
+  auto s2 = new_stream(mx::Device::gpu);
+  auto s3 = new_stream(mx::Device::gpu);

-  auto a = arange(1.f, 10.f, 1.f, float32, s2);
-  auto b = arange(1.f, 10.f, 1.f, float32, s3);
-  auto x = add(a, a, s2);
-  auto y = add(b, b, s3);
+  auto a = mx::arange(1.f, 10.f, 1.f, mx::float32, s2);
+  auto b = mx::arange(1.f, 10.f, 1.f, mx::float32, s3);
+  auto x = mx::add(a, a, s2);
+  auto y = mx::add(b, b, s3);

  // The multiply will happen on the default stream.
-  std::cout << multiply(x, y) << std::endl;
+  std::cout << mx::multiply(x, y) << std::endl;

-  metal::stop_capture();
+  mx::metal::stop_capture();
 }
--- a/examples/cpp/tutorial.cpp
+++ b/examples/cpp/tutorial.cpp
@@ -5,11 +5,11 @@

 #include "mlx/mlx.h"

-using namespace mlx::core;
+namespace mx = mlx::core;

 void array_basics() {
  // Make a scalar array:
-  array x(1.0);
+  mx::array x(1.0);

  // Get the value out of it:
  auto s = x.item<float>();
@@ -29,31 +29,31 @@ void array_basics() {

  // The datatype should be float32:
  auto dtype = x.dtype();
-  assert(dtype == float32);
+  assert(dtype == mx::float32);

  // Specify the dtype when constructing the array:
-  x = array(1, int32);
-  assert(x.dtype() == int32);
+  x = mx::array(1, mx::int32);
+  assert(x.dtype() == mx::int32);
  x.item<int>(); // OK
  // x.item<float>();  // Undefined!

  // Make a multidimensional array:
-  x = array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  x = mx::array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
  // mlx is row-major by default so the first row of this array
  // is [1.0, 2.0] and the second row is [3.0, 4.0]

  // Make an array of shape {2, 2} filled with ones:
-  auto y = ones({2, 2});
+  auto y = mx::ones({2, 2});

  // Pointwise add x and y:
-  auto z = add(x, y);
+  auto z = mx::add(x, y);

  // Same thing:
  z = x + y;

  // mlx is lazy by default. At this point `z` only
  // has a shape and a type but no actual data:
-  assert(z.dtype() == float32);
+  assert(z.dtype() == mx::float32);
  assert(z.shape(0) == 2);
  assert(z.shape(1) == 2);

@@ -63,33 +63,33 @@ void array_basics() {
  // and inputs. When `eval` is called on an array (or arrays), the array and
  // all of its dependencies are recursively evaluated to produce the result.
  // Once an array is evaluated, it has data and is detached from its inputs.
-  eval(z);
+  mx::eval(z);

-  // Of course the array can still be an input to other operations. You can even
-  // call eval on the array again, this will just be a no-op:
-  eval(z); // no-op
+  // Of course the array can still be an input to other operations. You can
+  // even call eval on the array again, this will just be a no-op:
+  mx::eval(z); // no-op

  // Some functions or methods on arrays implicitly evaluate them. For example
  // accessing a value in an array or printing the array implicitly evaluate it:
-  z = ones({1});
+  z = mx::ones({1});
  z.item<float>(); // implicit evaluation

-  z = ones({2, 2});
+  z = mx::ones({2, 2});
  std::cout << z << std::endl; // implicit evaluation
 }

 void automatic_differentiation() {
-  auto fn = [](array x) { return square(x); };
+  auto fn = [](mx::array x) { return mx::square(x); };

  // Computing the derivative function of a function
-  auto grad_fn = grad(fn);
+  auto grad_fn = mx::grad(fn);
  // Call grad_fn on the input to get the derivative
-  auto x = array(1.5);
+  auto x = mx::array(1.5);
  auto dfdx = grad_fn(x);
  // dfdx is 2 * x

  // Get the second derivative by composing grad with grad
-  auto d2fdx2 = grad(grad(fn))(x);
+  auto d2fdx2 = mx::grad(mx::grad(fn))(x);
  // d2fdx2 is 2
 }

--- a/examples/export/CMakeLists.txt
+++ b/examples/export/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.27)
+
+project(import_mlx LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+find_package(
+  Python 3.9
+  COMPONENTS Interpreter Development.Module
+  REQUIRED)
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE MLX_ROOT)
+find_package(MLX CONFIG REQUIRED)
+
+add_executable(eval_mlp eval_mlp.cpp)
+target_link_libraries(eval_mlp PRIVATE mlx)
+
+add_executable(train_mlp train_mlp.cpp)
+target_link_libraries(train_mlp PRIVATE mlx)
--- a/examples/export/README.md
+++ b/examples/export/README.md
@@ -0,0 +1,49 @@
+## Setup
+
+Install MLX:
+
+```bash
+pip install "mlx>=0.22"
+```
+
+Build the C++ examples:
+
+```bash
+cmake -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build
+```
+
+## Run
+
+### Eval MLP
+
+Run the Python script to export the eval function:
+
+```bash
+python eval_mlp.py
+```
+
+Then run the C++ program to import and run the function:
+
+```
+./build/eval_mlp
+```
+
+The Python and C++ programs should output the same result.
+
+### Train MLP
+
+Run the Python script to export the model initialization and training
+functions:
+
+```bash
+python train_mlp.py
+```
+
+Then run the C++ program to import and run the functions:
+
+```
+./build/train_mlp
+```
+
+The Python and C++ programs should output the same results.
--- a/examples/export/eval_mlp.cpp
+++ b/examples/export/eval_mlp.cpp
@@ -0,0 +1,25 @@
+// Copyright © 2024 Apple Inc.
+
+#include <mlx/mlx.h>
+#include <iostream>
+
+namespace mx = mlx::core;
+
+int main() {
+  int batch_size = 8;
+  int input_dim = 32;
+
+  // Make the input
+  mx::random::seed(42);
+  auto example_x = mx::random::uniform({batch_size, input_dim});
+
+  // Import the function
+  auto forward = mx::import_function("eval_mlp.mlxfn");
+
+  // Call the imported function
+  auto out = forward({example_x})[0];
+
+  std::cout << out << std::endl;
+
+  return 0;
+}
--- a/examples/export/eval_mlp.py
+++ b/examples/export/eval_mlp.py
@@ -0,0 +1,52 @@
+# Copyright © 2024 Apple Inc.
+
+import mlx.core as mx
+import mlx.nn as nn
+import mlx.utils
+
+
+class MLP(nn.Module):
+    """A simple MLP."""
+
+    def __init__(
+        self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
+    ):
+        super().__init__()
+        layer_sizes = [input_dim] + [hidden_dim] * num_layers + [output_dim]
+        self.layers = [
+            nn.Linear(idim, odim)
+            for idim, odim in zip(layer_sizes[:-1], layer_sizes[1:])
+        ]
+
+    def __call__(self, x):
+        for l in self.layers[:-1]:
+            x = nn.relu(l(x))
+        return self.layers[-1](x)
+
+
+if __name__ == "__main__":
+
+    batch_size = 8
+    input_dim = 32
+    output_dim = 10
+
+    # Build the model
+    mx.random.seed(0)  # Seed for params
+    model = MLP(num_layers=5, input_dim=input_dim, hidden_dim=64, output_dim=output_dim)
+    mx.eval(model)
+
+    # Note, the model parameters are saved in the exported function
+    def forward(x):
+        return model(x)
+
+    mx.random.seed(42)  # Seed for input
+    example_x = mx.random.uniform(shape=(batch_size, input_dim))
+
+    mx.export_function("eval_mlp.mlxfn", forward, example_x)
+
+    # Import in Python
+    imported_forward = mx.import_function("eval_mlp.mlxfn")
+    expected = forward(example_x)
+    (out,) = imported_forward(example_x)
+    assert mx.allclose(expected, out)
+    print(out)
--- a/examples/export/train_mlp.cpp
+++ b/examples/export/train_mlp.cpp
@@ -0,0 +1,41 @@
+// Copyright © 2024 Apple Inc.
+
+#include <mlx/mlx.h>
+#include <iostream>
+
+namespace mx = mlx::core;
+
+// Imports the functions exported by train_mlp.py and runs 100 training steps
+// on a fixed random batch, printing the loss every 10 iterations.
+int main() {
+  int batch_size = 8;
+  int input_dim = 32;
+  int output_dim = 10;
+
+  // Run the exported initialization function (it takes no inputs) to get the
+  // initial state
+  auto state = mx::import_function("init_mlp.mlxfn")({});
+
+  // Make the input
+  mx::random::seed(42);
+  auto example_X = mx::random::normal({batch_size, input_dim});
+  auto example_y = mx::random::randint(0, output_dim, {batch_size});
+
+  // Import the training step function
+  auto step = mx::import_function("train_mlp.mlxfn");
+
+  // Run the training loop
+  for (int it = 0; it < 100; ++it) {
+    // The step takes the state followed by the batch (X, y)
+    state.insert(state.end(), {example_X, example_y});
+    state = step(state);
+    mx::eval(state);
+    // It returns the updated state followed by the loss; split them apart
+    auto loss = state.back();
+    state.pop_back();
+    if (it % 10 == 0) {
+      std::cout << "Loss " << loss.item<float>() << std::endl;
+    }
+  }
+  return 0;
+}
--- a/examples/export/train_mlp.py
+++ b/examples/export/train_mlp.py
@@ -0,0 +1,76 @@
+# Copyright © 2024 Apple Inc.
+
+import mlx.core as mx
+import mlx.nn as nn
+import mlx.optimizers as optim
+import mlx.utils
+
+
+class MLP(nn.Module):
+    """A simple MLP."""
+
+    def __init__(
+        self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
+    ):
+        super().__init__()
+        layer_sizes = [input_dim] + [hidden_dim] * num_layers + [output_dim]
+        self.layers = [
+            nn.Linear(idim, odim)
+            for idim, odim in zip(layer_sizes[:-1], layer_sizes[1:])
+        ]
+
+    def __call__(self, x):
+        for l in self.layers[:-1]:
+            x = nn.relu(l(x))
+        return self.layers[-1](x)
+
+
+if __name__ == "__main__":
+
+    batch_size = 8
+    input_dim = 32
+    output_dim = 10
+
+    def init():
+        # Seed for the parameter initialization
+        mx.random.seed(0)
+        model = MLP(
+            num_layers=3, input_dim=input_dim, hidden_dim=64, output_dim=output_dim
+        )
+        optimizer = optim.SGD(learning_rate=1e-1)
+        optimizer.init(model.parameters())
+        state = [model.parameters(), optimizer.state]
+        tree_structure, state = zip(*mlx.utils.tree_flatten(state))
+        return model, optimizer, tree_structure, state
+
+    # Export the model parameter initialization
+    model, optimizer, tree_structure, state = init()
+    mx.eval(state)
+    mx.export_function("init_mlp.mlxfn", lambda: init()[-1])
+
+    def loss_fn(params, X, y):
+        model.update(params)
+        return nn.losses.cross_entropy(model(X), y, reduction="mean")
+
+    def step(*inputs):
+        *state, X, y = inputs
+        params, opt_state = mlx.utils.tree_unflatten(list(zip(tree_structure, state)))
+        optimizer.state = opt_state
+        loss, grads = mx.value_and_grad(loss_fn)(params, X, y)
+        params = optimizer.apply_gradients(grads, params)
+        _, state = zip(*mlx.utils.tree_flatten([params, optimizer.state]))
+        return *state, loss
+
+    # Make some random data
+    mx.random.seed(42)
+    example_X = mx.random.normal(shape=(batch_size, input_dim))
+    example_y = mx.random.randint(low=0, high=output_dim, shape=(batch_size,))
+    mx.export_function("train_mlp.mlxfn", step, *state, example_X, example_y)
+
+    # Import the exported SGD step and run it
+    imported_step = mx.import_function("train_mlp.mlxfn")
+
+    for it in range(100):
+        *state, loss = imported_step(*state, example_X, example_y)
+        if it % 10 == 0:
+            print(f"Loss {loss.item():.6}")
--- a/examples/extensions/CMakeLists.txt
+++ b/examples/extensions/CMakeLists.txt
@@ -18,8 +18,7 @@ find_package(
 execute_process(
  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
-  OUTPUT_VARIABLE NB_DIR)
-list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
+  OUTPUT_VARIABLE nanobind_ROOT)
 find_package(nanobind CONFIG REQUIRED)

 # ----------------------------- Extensions -----------------------------
--- a/examples/extensions/axpby/axpby.cpp
+++ b/examples/extensions/axpby/axpby.cpp
@@ -19,7 +19,7 @@
 #include "mlx/backend/metal/utils.h"
 #endif

-namespace mlx::core {
+namespace my_ext {

 ///////////////////////////////////////////////////////////////////////////////
 // Operation Implementation
@@ -32,24 +32,24 @@ namespace mlx::core {
 *  Follow numpy style broadcasting between x and y
 *  Inputs are upcasted to floats if needed
 **/
-array axpby(
-    const array& x, // Input array x
-    const array& y, // Input array y
+mx::array axpby(
+    const mx::array& x, // Input array x
+    const mx::array& y, // Input array y
    const float alpha, // Scaling factor for x
    const float beta, // Scaling factor for y
-    StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
+    mx::StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
 ) {
  // Promote dtypes between x and y as needed
  auto promoted_dtype = promote_types(x.dtype(), y.dtype());

  // Upcast to float32 for non-floating point inputs x and y
-  auto out_dtype = issubdtype(promoted_dtype, float32)
+  auto out_dtype = mx::issubdtype(promoted_dtype, mx::float32)
      ? promoted_dtype
-      : promote_types(promoted_dtype, float32);
+      : promote_types(promoted_dtype, mx::float32);

  // Cast x and y up to the determined dtype (on the same stream s)
-  auto x_casted = astype(x, out_dtype, s);
-  auto y_casted = astype(y, out_dtype, s);
+  auto x_casted = mx::astype(x, out_dtype, s);
+  auto y_casted = mx::astype(y, out_dtype, s);

  // Broadcast the shapes of x and y (on the same stream s)
  auto broadcasted_inputs = broadcast_arrays({x_casted, y_casted}, s);
@@ -57,12 +57,12 @@ array axpby(

  // Construct the array as the output of the Axpby primitive
  // with the broadcasted and upcasted arrays as inputs
-  return array(
-      /* const std::vector<int>& shape = */ out_shape,
-      /* Dtype dtype = */ out_dtype,
-      /* std::unique_ptr<Primitive> primitive = */
+  return mx::array(
+      /* const mx::Shape& shape = */ out_shape,
+      /* mx::Dtype dtype = */ out_dtype,
+      /* std::shared_ptr<mx::Primitive> primitive = */
      std::make_shared<Axpby>(to_stream(s), alpha, beta),
-      /* const std::vector<array>& inputs = */ broadcasted_inputs);
+      /* const std::vector<mx::array>& inputs = */ broadcasted_inputs);
 }

 ///////////////////////////////////////////////////////////////////////////////
@@ -71,16 +71,16 @@ array axpby(

 template <typename T>
 void axpby_impl(
-    const array& x,
-    const array& y,
-    array& out,
+    const mx::array& x,
+    const mx::array& y,
+    mx::array& out,
    float alpha_,
    float beta_) {
  // We only allocate memory when we are ready to fill the output
  // malloc_or_wait synchronously allocates available memory
  // There may be a wait executed here if the allocation is requested
  // under memory-pressured conditions
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));

  // Collect input and output data pointers
  const T* x_ptr = x.data<T>();
@@ -94,8 +94,8 @@ void axpby_impl(
  // Do the element-wise operation for each output
  for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
    // Map linear indices to offsets in x and y
-    auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
-    auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());
+    auto x_offset = mx::elem_to_loc(out_idx, x.shape(), x.strides());
+    auto y_offset = mx::elem_to_loc(out_idx, y.shape(), y.strides());

    // We allocate the output to be contiguous and regularly strided
    // (defaults to row major) and hence it doesn't need additional mapping
@@ -105,8 +105,8 @@ void axpby_impl(

 /** Fall back implementation for evaluation on CPU */
 void Axpby::eval(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& outputs) {
  // Check the inputs (registered in the op while constructing the out array)
  assert(inputs.size() == 2);
  auto& x = inputs[0];
@@ -114,14 +114,14 @@ void Axpby::eval(
  auto& out = outputs[0];

  // Dispatch to the correct dtype
-  if (out.dtype() == float32) {
+  if (out.dtype() == mx::float32) {
    return axpby_impl<float>(x, y, out, alpha_, beta_);
-  } else if (out.dtype() == float16) {
-    return axpby_impl<float16_t>(x, y, out, alpha_, beta_);
-  } else if (out.dtype() == bfloat16) {
-    return axpby_impl<bfloat16_t>(x, y, out, alpha_, beta_);
-  } else if (out.dtype() == complex64) {
-    return axpby_impl<complex64_t>(x, y, out, alpha_, beta_);
+  } else if (out.dtype() == mx::float16) {
+    return axpby_impl<mx::float16_t>(x, y, out, alpha_, beta_);
+  } else if (out.dtype() == mx::bfloat16) {
+    return axpby_impl<mx::bfloat16_t>(x, y, out, alpha_, beta_);
+  } else if (out.dtype() == mx::complex64) {
+    return axpby_impl<mx::complex64_t>(x, y, out, alpha_, beta_);
  } else {
    throw std::runtime_error(
        "Axpby is only supported for floating point types.");
@@ -136,9 +136,9 @@ void Axpby::eval(

 template <typename T>
 void axpby_impl_accelerate(
-    const array& x,
-    const array& y,
-    array& out,
+    const mx::array& x,
+    const mx::array& y,
+    mx::array& out,
    float alpha_,
    float beta_) {
  // Accelerate library provides catlas_saxpby which does
@@ -150,10 +150,10 @@ void axpby_impl_accelerate(
  // The data in the output array is allocated to match the strides in y
  // such that x, y, and out are contiguous in the same mode and
  // no transposition is needed
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));

  // We then copy over the elements using the contiguous vector specialization
-  copy_inplace(y, out, CopyType::Vector);
+  copy_inplace(y, out, mx::CopyType::Vector);

  // Get x and y pointers for catlas_saxpby
  const T* x_ptr = x.data<T>();
@@ -175,15 +175,15 @@ void axpby_impl_accelerate(

 /** Evaluate primitive on CPU using accelerate specializations */
 void Axpby::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& outputs) {
  assert(inputs.size() == 2);
  auto& x = inputs[0];
  auto& y = inputs[1];
  auto& out = outputs[0];

  // Accelerate specialization for contiguous single precision float arrays
-  if (out.dtype() == float32 &&
+  if (out.dtype() == mx::float32 &&
      ((x.flags().row_contiguous && y.flags().row_contiguous) ||
       (x.flags().col_contiguous && y.flags().col_contiguous))) {
    axpby_impl_accelerate<float>(x, y, out, alpha_, beta_);
@@ -198,8 +198,8 @@ void Axpby::eval_cpu(

 /** Evaluate primitive on CPU falling back to common backend */
 void Axpby::eval_cpu(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& outputs) {
  eval(inputs, outputs);
 }

@@ -213,8 +213,8 @@ void Axpby::eval_cpu(

 /** Evaluate primitive on GPU */
 void Axpby::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& outputs) {
  // Prepare inputs
  assert(inputs.size() == 2);
  auto& x = inputs[0];
@@ -225,7 +225,7 @@ void Axpby::eval_gpu(
  // and each stream carries its device identifiers
  auto& s = stream();
  // We get the needed metal device using the stream
-  auto& d = metal::device(s.device);
+  auto& d = mx::metal::device(s.device);

  // Prepare to specialize based on contiguity
  bool contiguous_kernel =
@@ -235,12 +235,12 @@ void Axpby::eval_gpu(
  // Allocate output memory with strides based on specialization
  if (contiguous_kernel) {
    out.set_data(
-        allocator::malloc_or_wait(x.data_size() * out.itemsize()),
+        mx::allocator::malloc_or_wait(x.data_size() * out.itemsize()),
        x.data_size(),
        x.strides(),
        x.flags());
  } else {
-    out.set_data(allocator::malloc_or_wait(out.nbytes()));
+    out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
  }

  // Resolve name of kernel (corresponds to axpby.metal)
@@ -279,7 +279,7 @@ void Axpby::eval_gpu(
  if (!contiguous_kernel) {
    compute_encoder.set_vector_bytes(x.shape(), 5);
    compute_encoder.set_vector_bytes(x.strides(), 6);
-    compute_encoder.set_bytes(y.strides(), 7);
+    compute_encoder.set_vector_bytes(y.strides(), 7);
    compute_encoder.set_bytes(ndim, 8);
  }

@@ -302,8 +302,8 @@ void Axpby::eval_gpu(

 /** Fail evaluation on GPU */
 void Axpby::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& out) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& out) {
  throw std::runtime_error("Axpby has no GPU implementation.");
 }

@@ -314,9 +314,9 @@ void Axpby::eval_gpu(
 ///////////////////////////////////////////////////////////////////////////////

 /** The Jacobian-vector product. */
-std::vector<array> Axpby::jvp(
-    const std::vector<array>& primals,
-    const std::vector<array>& tangents,
+std::vector<mx::array> Axpby::jvp(
+    const std::vector<mx::array>& primals,
+    const std::vector<mx::array>& tangents,
    const std::vector<int>& argnums) {
  // Forward mode diff that pushes along the tangents
  // The jvp transform on the primitive can built with ops
@@ -328,8 +328,8 @@ std::vector<array> Axpby::jvp(
  // scaled by beta
  if (argnums.size() > 1) {
    auto scale = argnums[0] == 0 ? alpha_ : beta_;
-    auto scale_arr = array(scale, tangents[0].dtype());
-    return {multiply(scale_arr, tangents[0], stream())};
+    auto scale_arr = mx::array(scale, tangents[0].dtype());
+    return {mx::multiply(scale_arr, tangents[0], stream())};
  }
  // If, argnums = {0, 1}, we take contributions from both
  // which gives us jvp = tangent_x * alpha + tangent_y * beta
@@ -339,24 +339,24 @@ std::vector<array> Axpby::jvp(
 }

 /** The vector-Jacobian product. */
-std::vector<array> Axpby::vjp(
-    const std::vector<array>& primals,
-    const std::vector<array>& cotangents,
+std::vector<mx::array> Axpby::vjp(
+    const std::vector<mx::array>& primals,
+    const std::vector<mx::array>& cotangents,
    const std::vector<int>& argnums,
-    const std::vector<array>&) {
+    const std::vector<mx::array>&) {
  // Reverse mode diff
-  std::vector<array> vjps;
+  std::vector<mx::array> vjps;
  for (auto arg : argnums) {
    auto scale = arg == 0 ? alpha_ : beta_;
-    auto scale_arr = array(scale, cotangents[0].dtype());
-    vjps.push_back(multiply(scale_arr, cotangents[0], stream()));
+    auto scale_arr = mx::array(scale, cotangents[0].dtype());
+    vjps.push_back(mx::multiply(scale_arr, cotangents[0], stream()));
  }
  return vjps;
 }

 /** Vectorize primitive along given axis */
-std::pair<std::vector<array>, std::vector<int>> Axpby::vmap(
-    const std::vector<array>& inputs,
+std::pair<std::vector<mx::array>, std::vector<int>> Axpby::vmap(
+    const std::vector<mx::array>& inputs,
    const std::vector<int>& axes) {
  throw std::runtime_error("Axpby has no vmap implementation.");
 }
@@ -367,4 +367,4 @@ bool Axpby::is_equivalent(const Primitive& other) const {
  return alpha_ == r_other.alpha_ && beta_ == r_other.beta_;
 }

-} // namespace mlx::core
+} // namespace my_ext
--- a/examples/extensions/axpby/axpby.h
+++ b/examples/extensions/axpby/axpby.h
@@ -5,7 +5,9 @@
 #include "mlx/ops.h"
 #include "mlx/primitives.h"

-namespace mlx::core {
+namespace mx = mlx::core;
+
+namespace my_ext {

 ///////////////////////////////////////////////////////////////////////////////
 // Operation
@@ -18,22 +20,22 @@ namespace mlx::core {
 *  Follow numpy style broadcasting between x and y
 *  Inputs are upcasted to floats if needed
 **/
-array axpby(
-    const array& x, // Input array x
-    const array& y, // Input array y
+mx::array axpby(
+    const mx::array& x, // Input array x
+    const mx::array& y, // Input array y
    const float alpha, // Scaling factor for x
    const float beta, // Scaling factor for y
-    StreamOrDevice s = {} // Stream on which to schedule the operation
+    mx::StreamOrDevice s = {} // Stream on which to schedule the operation
 );

 ///////////////////////////////////////////////////////////////////////////////
 // Primitive
 ///////////////////////////////////////////////////////////////////////////////

-class Axpby : public Primitive {
+class Axpby : public mx::Primitive {
 public:
-  explicit Axpby(Stream stream, float alpha, float beta)
-      : Primitive(stream), alpha_(alpha), beta_(beta) {};
+  explicit Axpby(mx::Stream stream, float alpha, float beta)
+      : mx::Primitive(stream), alpha_(alpha), beta_(beta) {};

  /**
   * A primitive must know how to evaluate itself on the CPU/GPU
@@ -42,23 +44,25 @@ class Axpby : public Primitive {
   * To avoid unnecessary allocations, the evaluation function
   * is responsible for allocating space for the array.
   */
-  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
-      override;
-  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
-      override;
+  void eval_cpu(
+      const std::vector<mx::array>& inputs,
+      std::vector<mx::array>& outputs) override;
+  void eval_gpu(
+      const std::vector<mx::array>& inputs,
+      std::vector<mx::array>& outputs) override;

  /** The Jacobian-vector product. */
-  std::vector<array> jvp(
-      const std::vector<array>& primals,
-      const std::vector<array>& tangents,
+  std::vector<mx::array> jvp(
+      const std::vector<mx::array>& primals,
+      const std::vector<mx::array>& tangents,
      const std::vector<int>& argnums) override;

  /** The vector-Jacobian product. */
-  std::vector<array> vjp(
-      const std::vector<array>& primals,
-      const std::vector<array>& cotangents,
+  std::vector<mx::array> vjp(
+      const std::vector<mx::array>& primals,
+      const std::vector<mx::array>& cotangents,
      const std::vector<int>& argnums,
-      const std::vector<array>& outputs) override;
+      const std::vector<mx::array>& outputs) override;

  /**
   * The primitive must know how to vectorize itself across
@@ -66,8 +70,8 @@ class Axpby : public Primitive {
   * representing the vectorized computation and the axis which
   * corresponds to the output vectorized dimension.
   */
-  std::pair<std::vector<array>, std::vector<int>> vmap(
-      const std::vector<array>& inputs,
+  std::pair<std::vector<mx::array>, std::vector<int>> vmap(
+      const std::vector<mx::array>& inputs,
      const std::vector<int>& axes) override;

  /** Print the primitive. */
@@ -76,14 +80,16 @@ class Axpby : public Primitive {
  }

  /** Equivalence check **/
-  bool is_equivalent(const Primitive& other) const override;
+  bool is_equivalent(const mx::Primitive& other) const override;

 private:
  float alpha_;
  float beta_;

  /** Fall back implementation for evaluation on CPU */
-  void eval(const std::vector<array>& inputs, std::vector<array>& outputs);
+  void eval(
+      const std::vector<mx::array>& inputs,
+      std::vector<mx::array>& outputs);
 };

-} // namespace mlx::core
+} // namespace my_ext
--- a/examples/extensions/axpby/axpby.metal
+++ b/examples/extensions/axpby/axpby.metal
@@ -12,8 +12,8 @@ template <typename T>
    constant const float& alpha [[buffer(3)]],
    constant const float& beta [[buffer(4)]],
    constant const int* shape [[buffer(5)]],
-    constant const size_t* x_strides [[buffer(6)]],
-    constant const size_t* y_strides [[buffer(7)]],
+    constant const int64_t* x_strides [[buffer(6)]],
+    constant const int64_t* y_strides [[buffer(7)]],
    constant const int& ndim [[buffer(8)]],
    uint index [[thread_position_in_grid]]) {
  auto x_offset = elem_to_loc(index, shape, x_strides, ndim);
@@ -34,29 +34,14 @@ template <typename T>
      static_cast<T>(alpha) * x[index] + static_cast<T>(beta) * y[index];
 }

-#define instantiate_axpby(type_name, type)                               \
-  template [[host_name("axpby_general_" #type_name)]] [[kernel]] void    \
-  axpby_general<type>(                                                   \
-      device const type* x [[buffer(0)]],                                \
-      device const type* y [[buffer(1)]],                                \
-      device type* out [[buffer(2)]],                                    \
-      constant const float& alpha [[buffer(3)]],                         \
-      constant const float& beta [[buffer(4)]],                          \
-      constant const int* shape [[buffer(5)]],                           \
-      constant const size_t* x_strides [[buffer(6)]],                    \
-      constant const size_t* y_strides [[buffer(7)]],                    \
-      constant const int& ndim [[buffer(8)]],                            \
-      uint index [[thread_position_in_grid]]);                           \
-  template [[host_name("axpby_contiguous_" #type_name)]] [[kernel]] void \
-  axpby_contiguous<type>(                                                \
-      device const type* x [[buffer(0)]],                                \
-      device const type* y [[buffer(1)]],                                \
-      device type* out [[buffer(2)]],                                    \
-      constant const float& alpha [[buffer(3)]],                         \
-      constant const float& beta [[buffer(4)]],                          \
-      uint index [[thread_position_in_grid]]);
+// clang-format off
+#define instantiate_axpby(type_name, type)                             \
+  instantiate_kernel("axpby_general_" #type_name, axpby_general, type) \
+  instantiate_kernel(                                                  \
+          "axpby_contiguous_" #type_name, axpby_contiguous, type)

 instantiate_axpby(float32, float);
 instantiate_axpby(float16, half);
 instantiate_axpby(bfloat16, bfloat16_t);
 instantiate_axpby(complex64, complex64_t);
+// clang-format on
--- a/examples/extensions/bindings.cpp
+++ b/examples/extensions/bindings.cpp
@@ -8,14 +8,12 @@
 namespace nb = nanobind;
 using namespace nb::literals;

-using namespace mlx::core;
-
 NB_MODULE(_ext, m) {
  m.doc() = "Sample extension for MLX";

  m.def(
      "axpby",
-      &axpby,
+      &my_ext::axpby,
      "x"_a,
      "y"_a,
      "alpha"_a,
--- a/examples/extensions/pyproject.toml
+++ b/examples/extensions/pyproject.toml
@@ -1,8 +1,8 @@
 [build-system]
 requires = [
  "setuptools>=42",
-  "cmake>=3.24",
+  "cmake>=3.25",
  "mlx>=0.18.0",
-  "nanobind==2.2.0",
+  "nanobind==2.4.0",
 ]
 build-backend = "setuptools.build_meta"
--- a/examples/extensions/requirements.txt
+++ b/examples/extensions/requirements.txt
@@ -1,4 +1,4 @@
 setuptools>=42
-cmake>=3.24
+cmake>=3.25
 mlx>=0.21.0
-nanobind==2.2.0
+nanobind==2.4.0
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@@ -5,6 +5,7 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/export.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
@@ -18,6 +19,16 @@ target_sources(
          ${CMAKE_CURRENT_SOURCE_DIR}/linalg.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h)

+if(MSVC)
+  # Disable some MSVC warnings to speed up compilation.
+  target_compile_options(mlx PUBLIC /wd4068 /wd4244 /wd4267 /wd4804)
+endif()
+
+if(WIN32)
+  # Export symbols by default to behave like macOS/linux.
+  set_target_properties(mlx PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
+endif()
+
 if(MLX_BUILD_CPU)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)
 else()
--- a/mlx/array.cpp
+++ b/mlx/array.cpp
@@ -10,22 +10,8 @@

 namespace mlx::core {

-namespace {
-
-/** Return true if we are currently performing a function transformation in
- * order to keep the graph when evaluating tracer arrays. */
-bool in_tracing() {
-  return detail::InTracing::in_tracing();
-}
-
-bool retain_graph() {
-  return detail::RetainGraph::retain_graph();
-}
-
-} // namespace
-
 array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
-    : array_desc_(std::make_shared<ArrayDesc>(std::vector<int>{}, dtype)) {
+    : array_desc_(std::make_shared<ArrayDesc>(Shape{}, dtype)) {
  auto cval = static_cast<complex64_t>(val);
  init(&cval);
 }
@@ -61,14 +47,14 @@ std::vector<array> array::make_arrays(

 array::array(std::initializer_list<float> data)
    : array_desc_(std::make_shared<ArrayDesc>(
-          std::vector<int>{static_cast<int>(data.size())},
+          Shape{static_cast<ShapeElem>(data.size())},
          float32)) {
  init(data.begin());
 }

 array::array(std::initializer_list<int> data, Dtype dtype)
    : array_desc_(std::make_shared<ArrayDesc>(
-          std::vector<int>{static_cast<int>(data.size())},
+          Shape{static_cast<ShapeElem>(data.size())},
          dtype)) {
  init(data.begin());
 }
@@ -119,7 +105,8 @@ void array::eval() {
 }

 bool array::is_tracer() const {
-  return array_desc_->is_tracer && in_tracing() || retain_graph();
+  return (array_desc_->is_tracer && detail::in_tracing()) ||
+      detail::retain_graph();
 }

 void array::set_data(allocator::Buffer buffer, Deleter d) {
@@ -277,7 +264,19 @@ array::ArrayDesc::~ArrayDesc() {
    }
    ad.inputs.clear();
    for (auto& [_, a] : input_map) {
-      if (a.array_desc_.use_count() <= a.siblings().size() + 1) {
+      bool is_deletable =
+          (a.array_desc_.use_count() <= a.siblings().size() + 1);
+      // An array with siblings is deletable only if all of its siblings
+      // are deletable
+      for (auto& s : a.siblings()) {
+        if (!is_deletable) {
+          break;
+        }
+        int is_input = (input_map.find(s.id()) != input_map.end());
+        is_deletable &=
+            s.array_desc_.use_count() <= a.siblings().size() + is_input;
+      }
+      if (is_deletable) {
        for_deletion.push_back(std::move(a.array_desc_));
      }
    }
@@ -310,7 +309,7 @@ array::ArrayIterator::ArrayIterator(const array& arr, int idx)
 }

 array::ArrayIterator::reference array::ArrayIterator::operator*() const {
-  auto start = std::vector<int>(arr.ndim(), 0);
+  auto start = Shape(arr.ndim(), 0);
  auto end = arr.shape();
  auto shape = arr.shape();
  shape.erase(shape.begin());
--- a/mlx/array.h
+++ b/mlx/array.h
@@ -17,8 +17,9 @@ namespace mlx::core {
 class Primitive;

 using Deleter = std::function<void(allocator::Buffer)>;
-using Shape = std::vector<int32_t>;
-using Strides = std::vector<size_t>;
+using ShapeElem = int32_t;
+using Shape = std::vector<ShapeElem>;
+using Strides = std::vector<int64_t>;

 class array {
  /* An array is really a node in a graph. It contains a shared ArrayDesc
@@ -34,29 +35,29 @@ class array {
  explicit array(const std::complex<float>& val, Dtype dtype = complex64);

  template <typename It>
-  array(
+  explicit array(
      It data,
      Shape shape,
      Dtype dtype =
          TypeToDtype<typename std::iterator_traits<It>::value_type>());

  template <typename T>
-  array(std::initializer_list<T> data, Dtype dtype = TypeToDtype<T>());
+  explicit array(std::initializer_list<T> data, Dtype dtype = TypeToDtype<T>());

  /* Special case so empty lists default to float32. */
-  array(std::initializer_list<float> data);
+  explicit array(std::initializer_list<float> data);

  /* Special case so array({}, type) is an empty array. */
-  array(std::initializer_list<int> data, Dtype dtype);
+  explicit array(std::initializer_list<int> data, Dtype dtype);

  template <typename T>
-  array(
+  explicit array(
      std::initializer_list<T> data,
      Shape shape,
      Dtype dtype = TypeToDtype<T>());

  /* Build an array from a buffer */
-  array(
+  explicit array(
      allocator::Buffer data,
      Shape shape,
      Dtype dtype,
@@ -498,7 +499,7 @@ class array {

 template <typename T>
 array::array(T val, Dtype dtype /* = TypeToDtype<T>() */)
-    : array_desc_(std::make_shared<ArrayDesc>(std::vector<int>{}, dtype)) {
+    : array_desc_(std::make_shared<ArrayDesc>(Shape{}, dtype)) {
  init(&val);
 }

@@ -516,7 +517,7 @@ array::array(
    std::initializer_list<T> data,
    Dtype dtype /* = TypeToDtype<T>() */)
    : array_desc_(std::make_shared<ArrayDesc>(
-          std::vector<int>{static_cast<int>(data.size())},
+          Shape{static_cast<ShapeElem>(data.size())},
          dtype)) {
  init(data.begin());
 }
--- a/mlx/backend/accelerate/primitives.cpp
+++ b/mlx/backend/accelerate/primitives.cpp
@@ -32,6 +32,7 @@ DEFAULT(ArgSort)
 DEFAULT(AsStrided)
 DEFAULT(BlockMaskedMM)
 DEFAULT(Broadcast)
+DEFAULT(BroadcastAxes)
 DEFAULT(Ceil)
 DEFAULT(Concatenate)
 DEFAULT(Conjugate)
@@ -43,6 +44,7 @@ DEFAULT(NumberOfElements)
 DEFAULT(Equal)
 DEFAULT(Erf)
 DEFAULT(ErfInv)
+DEFAULT(ExpandDims)
 DEFAULT(FFT)
 DEFAULT(Floor)
 DEFAULT(Gather)
@@ -65,7 +67,6 @@ DEFAULT(Pad)
 DEFAULT(Partition)
 DEFAULT_MULTI(QRF)
 DEFAULT(RandomBits)
-DEFAULT(Reshape)
 DEFAULT(Remainder)
 DEFAULT(Round)
 DEFAULT(Scatter)
@@ -76,6 +77,7 @@ DEFAULT(Slice)
 DEFAULT(SliceUpdate)
 DEFAULT_MULTI(Split)
 DEFAULT(Sort)
+DEFAULT(Squeeze)
 DEFAULT(StopGradient)
 DEFAULT_MULTI(SVD)
 DEFAULT(Transpose)
--- a/mlx/backend/common/CMakeLists.txt
+++ b/mlx/backend/common/CMakeLists.txt
@@ -5,13 +5,21 @@ else()
  set(COMPILER ${CMAKE_CXX_COMPILER})
 endif()

+if(MSVC)
+  set(SHELL_EXT ps1)
+  set(SHELL_CMD powershell -ExecutionPolicy Bypass -File)
+else()
+  set(SHELL_EXT sh)
+  set(SHELL_CMD /bin/bash)
+endif()
+
 add_custom_command(
  OUTPUT compiled_preamble.cpp
  COMMAND
-    /bin/bash ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
+    ${SHELL_CMD} ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.${SHELL_EXT}
    ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp ${COMPILER}
-    ${PROJECT_SOURCE_DIR} ${CLANG}
-  DEPENDS make_compiled_preamble.sh
+    ${PROJECT_SOURCE_DIR} ${CLANG} ${CMAKE_SYSTEM_PROCESSOR}
+  DEPENDS make_compiled_preamble.${SHELL_EXT}
          compiled_preamble.h
          ${PROJECT_SOURCE_DIR}/mlx/types/half_types.h
          ${PROJECT_SOURCE_DIR}/mlx/types/fp16.h
@@ -58,5 +66,6 @@ target_sources(
 if(IOS)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled_nocpu.cpp)
 else()
-  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled_cpu.cpp)
+  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled_cpu.cpp
+                             ${CMAKE_CURRENT_SOURCE_DIR}/jit_compiler.cpp)
 endif()
--- a/mlx/backend/common/arg_reduce.cpp
+++ b/mlx/backend/common/arg_reduce.cpp
@@ -13,8 +13,8 @@ template <typename InT, typename OpT>
 void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto axis_size = in.shape()[axis];
  auto axis_stride = in.strides()[axis];
-  std::vector<size_t> strides = in.strides();
-  std::vector<int> shape = in.shape();
+  Strides strides = in.strides();
+  Shape shape = in.shape();
  strides.erase(strides.begin() + axis);
  shape.erase(shape.begin() + axis);
  for (uint32_t i = 0; i < out.size(); ++i) {
--- a/mlx/backend/common/binary.h
+++ b/mlx/backend/common/binary.h
@@ -28,8 +28,8 @@ BinaryOpType get_binary_op_type(const array& a, const array& b) {
  } else if (b.data_size() == 1 && a.flags().contiguous) {
    bopt = BinaryOpType::VectorScalar;
  } else if (
-      a.flags().row_contiguous && b.flags().row_contiguous ||
-      a.flags().col_contiguous && b.flags().col_contiguous) {
+      (a.flags().row_contiguous && b.flags().row_contiguous) ||
+      (a.flags().col_contiguous && b.flags().col_contiguous)) {
    bopt = BinaryOpType::VectorVector;
  } else {
    bopt = BinaryOpType::General;
@@ -178,10 +178,10 @@ void binary_op_dims(
    const T* b,
    U* out,
    Op op,
-    const std::vector<int>& shape,
-    const std::vector<size_t>& a_strides,
-    const std::vector<size_t>& b_strides,
-    const std::vector<size_t>& out_strides,
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& out_strides,
    int axis) {
  auto stride_a = a_strides[axis];
  auto stride_b = b_strides[axis];
@@ -212,10 +212,10 @@ void binary_op_dispatch_dims(
    array& out,
    Op op,
    int dim,
-    const std::vector<int>& shape,
-    const std::vector<size_t>& a_strides,
-    const std::vector<size_t>& b_strides,
-    const std::vector<size_t>& out_strides) {
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& out_strides) {
  const T* a_ptr = a.data<T>();
  const T* b_ptr = b.data<T>();
  U* out_ptr = out.data<U>();
@@ -258,10 +258,10 @@ void binary_op_dispatch_dims(
      return;
  }

-  ContiguousIterator<size_t> a_it(shape, a_strides, dim - 3);
-  ContiguousIterator<size_t> b_it(shape, b_strides, dim - 3);
-  size_t stride = out_strides[dim - 4];
-  for (size_t elem = 0; elem < a.size(); elem += stride) {
+  ContiguousIterator a_it(shape, a_strides, dim - 3);
+  ContiguousIterator b_it(shape, b_strides, dim - 3);
+  auto stride = out_strides[dim - 4];
+  for (int64_t elem = 0; elem < a.size(); elem += stride) {
    binary_op_dims<T, U, Op, 3, Strided>(
        a_ptr + a_it.loc,
        b_ptr + b_it.loc,
@@ -327,7 +327,7 @@ void binary_op(
  const auto& strides = new_strides[2];

  // Get the left-most dim such that the array is row contiguous after
-  auto leftmost_rc_dim = [&strides](const std::vector<size_t>& arr_strides) {
+  auto leftmost_rc_dim = [&strides](const auto& arr_strides) {
    int d = arr_strides.size() - 1;
    for (; d >= 0 && arr_strides[d] == strides[d]; d--) {
    }
@@ -337,7 +337,7 @@ void binary_op(
  auto b_rc_dim = leftmost_rc_dim(b_strides);

  // Get the left-most dim such that the array is a broadcasted "scalar" after
-  auto leftmost_s_dim = [](const std::vector<size_t>& arr_strides) {
+  auto leftmost_s_dim = [](const auto& arr_strides) {
    int d = arr_strides.size() - 1;
    for (; d >= 0 && arr_strides[d] == 0; d--) {
    }
--- a/mlx/backend/common/binary_two.h
+++ b/mlx/backend/common/binary_two.h
@@ -16,10 +16,10 @@ void binary_op_dims(
    U* out_a,
    U* out_b,
    Op op,
-    const std::vector<int>& shape,
-    const std::vector<size_t>& a_strides,
-    const std::vector<size_t>& b_strides,
-    const std::vector<size_t>& out_strides,
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& out_strides,
    int axis) {
  auto stride_a = a_strides[axis];
  auto stride_b = b_strides[axis];
@@ -96,9 +96,9 @@ void binary_op_dispatch_dims(
      return;
  }

-  ContiguousIterator<size_t> a_it(shape, a_strides, ndim - 2);
-  ContiguousIterator<size_t> b_it(shape, b_strides, ndim - 2);
-  size_t stride = out_strides[ndim - 3];
+  ContiguousIterator a_it(shape, a_strides, ndim - 2);
+  ContiguousIterator b_it(shape, b_strides, ndim - 2);
+  auto stride = out_strides[ndim - 3];
  for (size_t elem = 0; elem < a.size(); elem += stride) {
    binary_op_dims<T, U, Op, 2>(
        a_ptr + a_it.loc,
--- a/mlx/backend/common/common.cpp
+++ b/mlx/backend/common/common.cpp
@@ -42,14 +42,12 @@ void AsStrided::eval(const std::vector<array>& inputs, array& out) {
  return move_or_copy(in, out, strides_, flags, data_size, offset_);
 }

-void Broadcast::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  const auto& in = inputs[0];
+void broadcast(const array& in, array& out) {
  if (out.size() == 0) {
    out.set_data(nullptr);
    return;
  }
-  std::vector<size_t> strides(out.ndim(), 0);
+  Strides strides(out.ndim(), 0);
  int diff = out.ndim() - in.ndim();
  for (int i = in.ndim() - 1; i >= 0; --i) {
    strides[i + diff] = (in.shape()[i] == 1) ? 0 : in.strides()[i];
@@ -61,6 +59,14 @@ void Broadcast::eval(const std::vector<array>& inputs, array& out) {
  move_or_copy(in, out, strides, flags, in.data_size());
 }

+void Broadcast::eval(const std::vector<array>& inputs, array& out) {
+  broadcast(inputs[0], out);
+}
+
+void BroadcastAxes::eval(const std::vector<array>& inputs, array& out) {
+  broadcast(inputs[0], out);
+}
+
 void Copy::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  move_or_copy(inputs[0], out);
@@ -85,6 +91,16 @@ void Depends::eval(
  }
 }

+void ExpandDims::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  auto strides = in.strides();
+  for (auto ax : axes_) {
+    strides.insert(strides.begin() + ax, 1);
+  }
+  move_or_copy(in, out, strides, in.flags(), in.data_size());
+}
+
 void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  out.set_data(allocator::malloc_or_wait(out.nbytes()));
@@ -141,9 +157,7 @@ void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
  }
 }

-std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
-    const array& in,
-    const array& out) {
+std::pair<bool, Strides> prepare_reshape(const array& in, const array& out) {
  // Special case for empty arrays or row contiguous arrays
  if (in.size() == 0 || in.flags().row_contiguous) {
    return {false, out.strides()};
@@ -151,8 +165,7 @@ std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(

  // Special case for scalars
  if (in.ndim() == 0) {
-    std::vector<size_t> out_strides(out.ndim(), 0);
-    return {false, out_strides};
+    return {false, Strides(out.ndim(), 0)};
  }

  // Firstly let's collapse all the contiguous dimensions of the input
@@ -160,7 +173,7 @@ std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(

  // If shapes fit exactly in the contiguous dims then no copy is necessary so
  // let's check.
-  std::vector<size_t> out_strides;
+  Strides out_strides;
  bool copy_necessary = false;
  int j = 0;
  for (int i = 0; i < out.ndim(); i++) {
@@ -181,9 +194,9 @@ std::pair<bool, std::vector<size_t>> Reshape::prepare_reshape(
  return {copy_necessary, out_strides};
 }

-void Reshape::shared_buffer_reshape(
+void shared_buffer_reshape(
    const array& in,
-    const std::vector<size_t>& out_strides,
+    const Strides& out_strides,
    array& out) {
  auto flags = in.flags();
  if (flags.row_contiguous) {
@@ -249,16 +262,18 @@ void Split::eval(
  }
 }

-std::tuple<int64_t, std::vector<int64_t>> SliceUpdate::prepare_slice(
-    const array& in) {
-  int64_t data_offset = 0;
-  std::vector<int64_t> inp_strides(in.ndim(), 0);
-  for (int i = 0; i < in.ndim(); ++i) {
-    data_offset += start_indices_[i] * in.strides()[i];
-    inp_strides[i] = in.strides()[i] * strides_[i];
+void Squeeze::eval(const std::vector<array>& inputs, array& out) {
+  assert(inputs.size() == 1);
+  const auto& in = inputs[0];
+  Strides strides;
+  for (int i = 0, j = 0; i < in.ndim(); ++i) {
+    if (j < axes_.size() && i == axes_[j]) {
+      j++;
+    } else {
+      strides.push_back(in.strides(i));
+    }
  }
-
-  return std::make_tuple(data_offset, inp_strides);
+  move_or_copy(in, out, strides, in.flags(), in.data_size());
 }

 void StopGradient::eval(const std::vector<array>& inputs, array& out) {
@@ -268,7 +283,7 @@ void StopGradient::eval(const std::vector<array>& inputs, array& out) {

 void Transpose::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
-  std::vector<size_t> out_strides(out.ndim());
+  Strides out_strides(out.ndim());
  auto& in = inputs[0];
  for (int ax = 0; ax < axes_.size(); ++ax) {
    out_strides[ax] = in.strides()[axes_[ax]];
@@ -285,8 +300,8 @@ void Transpose::eval(const std::vector<array>& inputs, array& out) {
  //   true, they stay true)
  auto flags = in.flags();
  if (flags.contiguous && in.data_size() == in.size()) {
-    size_t f_stride = 1;
-    size_t b_stride = 1;
+    int64_t f_stride = 1;
+    int64_t b_stride = 1;
    flags.col_contiguous = true;
    flags.row_contiguous = true;
    for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
--- a/mlx/backend/common/compiled.cpp
+++ b/mlx/backend/common/compiled.cpp
@@ -130,7 +130,7 @@ std::string build_lib_name(

 bool compiled_check_contiguity(
    const std::vector<array>& inputs,
-    const std::vector<int>& shape) {
+    const Shape& shape) {
  bool contiguous = true;
  bool all_contig = true;
  bool all_row_contig = true;
@@ -165,7 +165,7 @@ void compiled_allocate_outputs(
    bool move_buffers /* = false */) {
  if (contiguous) {
    int o = 0;
-    std::vector<size_t> strides;
+    Strides strides;
    size_t data_size;
    array::Flags flags;
    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
--- a/mlx/backend/common/compiled.h
+++ b/mlx/backend/common/compiled.h
@@ -11,9 +11,7 @@
 namespace mlx::core {

 inline bool is_static_cast(const Primitive& p) {
-  return (
-      typeid(p) == typeid(Broadcast) || typeid(p) == typeid(Copy) ||
-      typeid(p) == typeid(StopGradient) || typeid(p) == typeid(AsType));
+  return (typeid(p) == typeid(Broadcast) || typeid(p) == typeid(AsType));
 }

 std::string build_lib_name(
@@ -56,7 +54,7 @@ inline bool is_scalar(const array& x) {
 // Check if we can use a contiguous operation given inputs and the output shape
 bool compiled_check_contiguity(
    const std::vector<array>& inputs,
-    const std::vector<int>& shape);
+    const Shape& shape);

 // Allocate space for the outputs possibly with input donation
 void compiled_allocate_outputs(
--- a/mlx/backend/common/compiled_cpu.cpp
+++ b/mlx/backend/common/compiled_cpu.cpp
@@ -7,8 +7,11 @@
 #include <mutex>
 #include <shared_mutex>

+#include <fmt/format.h>
+
 #include "mlx/backend/common/compiled.h"
 #include "mlx/backend/common/compiled_preamble.h"
+#include "mlx/backend/common/jit_compiler.h"
 #include "mlx/device.h"
 #include "mlx/graph_utils.h"

@@ -44,11 +47,8 @@ namespace detail {
 bool compile_available_for_device(const Device& device) {
  return true;
 }
-} // namespace detail

-std::string get_temp_file(const std::string& name) {
-  return std::filesystem::temp_directory_path().append(name).string();
-}
+} // namespace detail

 // Return a pointer to a compiled function
 void* compile(
@@ -68,24 +68,30 @@ void* compile(
  std::string source_code = source_builder();
  std::string kernel_file_name;

-  // Deal with long kernel names. Maximum length for files on macOS is 255
-  // characters. Clip file name with a little extra room and append a 16
-  // character hash.
+  // Deal with long kernel names. Maximum length for filename on macOS is 255
+  // characters, and on Windows the maximum length for whole path is 260. Clip
+  // file name with a little extra room and append a 16 character hash.
+#ifdef _WIN32
+  constexpr int max_file_name_length = 140;
+#else
  constexpr int max_file_name_length = 245;
+#endif
  if (kernel_name.size() > max_file_name_length) {
    std::ostringstream file_name;
    file_name
        << std::string_view(kernel_name).substr(0, max_file_name_length - 16);
-    auto file_id = std::hash<std::string>{}(kernel_name);
+    auto file_id =
+        std::hash<std::string>{}(kernel_name.substr(max_file_name_length - 16));
    file_name << "_" << std::hex << std::setw(16) << file_id << std::dec;
    kernel_file_name = file_name.str();
  } else {
    kernel_file_name = kernel_name;
  }

-  std::ostringstream shared_lib_name;
-  shared_lib_name << "lib" << kernel_file_name << ".so";
-  auto shared_lib_path = get_temp_file(shared_lib_name.str());
+  auto output_dir = std::filesystem::temp_directory_path();
+
+  std::string shared_lib_name = "lib" + kernel_file_name + ".so";
+  auto shared_lib_path = (output_dir / shared_lib_name).string();
  bool lib_exists = false;
  {
    std::ifstream f(shared_lib_path.c_str());
@@ -94,24 +100,21 @@ void* compile(

  if (!lib_exists) {
    // Open source file and write source code to it
-    std::ostringstream source_file_name;
-    source_file_name << kernel_file_name << ".cpp";
-    auto source_file_path = get_temp_file(source_file_name.str());
+    std::string source_file_name = kernel_file_name + ".cpp";
+    auto source_file_path = (output_dir / source_file_name).string();

    std::ofstream source_file(source_file_path);
    source_file << source_code;
    source_file.close();

-    std::ostringstream build_command;
-    build_command << "g++ -std=c++17 -O3 -Wall -fPIC -shared '"
-                  << source_file_path << "' -o '" << shared_lib_path << "'";
-    std::string build_command_str = build_command.str();
-    auto return_code = system(build_command_str.c_str());
-    if (return_code) {
-      std::ostringstream msg;
-      msg << "[Compile::eval_cpu] Failed to compile function " << kernel_name
-          << " with error code " << return_code << "." << std::endl;
-      throw std::runtime_error(msg.str());
+    try {
+      JitCompiler::exec(JitCompiler::build_command(
+          output_dir, source_file_name, shared_lib_name));
+    } catch (const std::exception& error) {
+      throw std::runtime_error(fmt::format(
+          "[Compile::eval_cpu] Failed to compile function {0}: {1}",
+          kernel_name,
+          error.what()));
    }
  }

@@ -151,6 +154,11 @@ inline void build_kernel(

  NodeNamer namer;

+#ifdef _MSC_VER
+  // Export the symbol
+  os << "__declspec(dllexport) ";
+#endif
+
  // Start the kernel
  os << "void " << kernel_name << "(void** args) {" << std::endl;

--- a/mlx/backend/common/conv.cpp
+++ b/mlx/backend/common/conv.cpp
@@ -726,7 +726,7 @@ void explicit_gemm_conv_1D_cpu(
  auto conv_dtype = float32;

  // Pad input
-  std::vector<int> padded_shape = {N, iH + 2 * padding[0], C};
+  Shape padded_shape = {N, iH + 2 * padding[0], C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
@@ -746,9 +746,9 @@ void explicit_gemm_conv_1D_cpu(
  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);

  // Make strided view
-  std::vector<int> strided_shape = {N, oH, wH, C};
+  Shape strided_shape = {N, oH, wH, C};

-  std::vector<size_t> strided_strides = {
+  Strides strided_strides = {
      in_padded.strides()[0],
      in_padded.strides()[1] * wt_strides[0],
      in_padded.strides()[1],
@@ -765,7 +765,7 @@ void explicit_gemm_conv_1D_cpu(
      in_padded, strided_strides, flags, in_strided_view.size(), 0);

  // Materialize strided view
-  std::vector<int> strided_reshape = {N * oH, wH * C};
+  Shape strided_reshape = {N * oH, wH * C};
  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
  copy(in_strided_view, in_strided, CopyType::General);

@@ -843,8 +843,7 @@ void explicit_gemm_conv_2D_cpu(
  auto conv_dtype = out.dtype();

  // Pad input
-  std::vector<int> padded_shape = {
-      N, iH + 2 * padding[0], iW + 2 * padding[1], C};
+  Shape padded_shape = {N, iH + 2 * padding[0], iW + 2 * padding[1], C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
@@ -865,9 +864,9 @@ void explicit_gemm_conv_2D_cpu(
  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);

  // Make strided view
-  std::vector<int> strided_shape = {N, oH, oW, wH, wW, C};
+  Shape strided_shape = {N, oH, oW, wH, wW, C};

-  std::vector<size_t> strided_strides = {
+  Strides strided_strides = {
      in_padded.strides()[0],
      in_padded.strides()[1] * wt_strides[0],
      in_padded.strides()[2] * wt_strides[1],
@@ -881,7 +880,7 @@ void explicit_gemm_conv_2D_cpu(
      in_padded, strided_strides, flags, in_strided_view.size(), 0);

  // Materialize strided view
-  std::vector<int> strided_reshape = {N * oH * oW, wH * wW * C};
+  Shape strided_reshape = {N * oH * oW, wH * wW * C};
  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
  copy(in_strided_view, in_strided, CopyType::General);

@@ -934,19 +933,19 @@ void explicit_gemm_conv_ND_cpu(
    const std::vector<int>& wt_dilation,
    const bool flip) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
-  const auto iDim = std::vector<int>(
-      in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
-  const auto oDim = std::vector<int>(
+  const auto iDim =
+      Shape(in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
+  const auto oDim = Shape(
      out.shape().begin() + 1, out.shape().end() - 1); // Output spatial dim
  const int O = wt.shape(0); // Out channels
  const int C = wt.shape(-1); // In channels
-  const auto wDim = std::vector<int>(
-      wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim
+  const auto wDim =
+      Shape(wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim

  auto conv_dtype = float32;

  // Pad input
-  std::vector<int> padded_shape(in.shape().size());
+  Shape padded_shape(in.shape().size());
  padded_shape.front() = N;
  for (size_t i = 0; i < iDim.size(); i++) {
    padded_shape[i + 1] = iDim[i] + 2 * padding[i];
@@ -974,7 +973,7 @@ void explicit_gemm_conv_ND_cpu(
  copy_inplace(in, in_padded_slice, CopyType::GeneralGeneral);

  // Make strided view
-  std::vector<int> strided_shape(oDim.size() + wDim.size() + 2);
+  Shape strided_shape(oDim.size() + wDim.size() + 2);
  strided_shape.front() = N;
  for (size_t i = 0; i < oDim.size(); i++) {
    strided_shape[i + 1] = oDim[i];
@@ -984,7 +983,7 @@ void explicit_gemm_conv_ND_cpu(
  }
  strided_shape.back() = C;

-  std::vector<size_t> strided_strides(in.shape().size() * 2 - 2);
+  Strides strided_strides(in.shape().size() * 2 - 2);
  strided_strides[0] = in_padded.strides()[0];
  for (size_t i = 0; i < wt_strides.size(); i++) {
    strided_strides[i + 1] = in_padded.strides()[i + 1] * wt_strides[i];
@@ -1000,7 +999,7 @@ void explicit_gemm_conv_ND_cpu(
      in_padded, strided_strides, flags, in_strided_view.size(), 0);

  // Materialize strided view
-  std::vector<int> strided_reshape = {N, C};
+  Shape strided_reshape = {N, C};
  for (const auto& o : oDim) {
    strided_reshape[0] *= o;
  }
--- a/mlx/backend/common/copy.cpp
+++ b/mlx/backend/common/copy.cpp
@@ -26,13 +26,13 @@ void copy_vector(const array& src, array& dst) {
  std::copy(src_ptr, src_ptr + src.data_size(), dst_ptr);
 }

-template <typename SrcT, typename DstT, typename StrideT, int D>
+template <typename SrcT, typename DstT, int D>
 inline void copy_dims(
    const SrcT* src,
    DstT* dst,
-    const std::vector<int>& shape,
-    const std::vector<StrideT>& i_strides,
-    const std::vector<StrideT>& o_strides,
+    const Shape& shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
    int axis) {
  auto stride_src = i_strides[axis];
  auto stride_dst = o_strides[axis];
@@ -40,7 +40,7 @@ inline void copy_dims(

  for (int i = 0; i < N; i++) {
    if constexpr (D > 1) {
-      copy_dims<SrcT, DstT, StrideT, D - 1>(
+      copy_dims<SrcT, DstT, D - 1>(
          src, dst, shape, i_strides, o_strides, axis + 1);
    } else {
      *dst = static_cast<DstT>(*src);
@@ -50,13 +50,13 @@ inline void copy_dims(
  }
 }

-template <typename SrcT, typename DstT, typename StrideT>
+template <typename SrcT, typename DstT>
 void copy_general_general(
    const array& src,
    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<StrideT>& i_strides,
-    const std::vector<StrideT>& o_strides,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset) {
  if (data_shape.empty()) {
@@ -65,30 +65,30 @@ void copy_general_general(
    *dst_ptr = val;
    return;
  }
-  auto [shape, strides] = collapse_contiguous_dims(
-      data_shape, std::vector<std::vector<StrideT>>{i_strides, o_strides});
+  auto [shape, strides] =
+      collapse_contiguous_dims(data_shape, {i_strides, o_strides});
  auto src_ptr = src.data<SrcT>() + i_offset;
  auto dst_ptr = dst.data<DstT>() + o_offset;
  int ndim = shape.size();
  if (ndim == 1) {
-    copy_dims<SrcT, DstT, StrideT, 1>(
+    copy_dims<SrcT, DstT, 1>(
        src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
    return;
  } else if (ndim == 2) {
-    copy_dims<SrcT, DstT, StrideT, 2>(
+    copy_dims<SrcT, DstT, 2>(
        src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
    return;
  } else if (ndim == 3) {
-    copy_dims<SrcT, DstT, StrideT, 3>(
+    copy_dims<SrcT, DstT, 3>(
        src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
    return;
  }
-  ContiguousIterator<StrideT> in(shape, strides[0], ndim - 3);
-  ContiguousIterator<StrideT> out(shape, strides[1], ndim - 3);
-  StrideT stride = std::accumulate(
-      shape.end() - 3, shape.end(), 1, std::multiplies<StrideT>());
-  for (StrideT elem = 0; elem < src.size(); elem += stride) {
-    copy_dims<SrcT, DstT, StrideT, 3>(
+  ContiguousIterator in(shape, strides[0], ndim - 3);
+  ContiguousIterator out(shape, strides[1], ndim - 3);
+  auto stride = std::accumulate(
+      shape.end() - 3, shape.end(), 1, std::multiplies<int64_t>());
+  for (int64_t elem = 0; elem < src.size(); elem += stride) {
+    copy_dims<SrcT, DstT, 3>(
        src_ptr + in.loc,
        dst_ptr + out.loc,
        shape,
@@ -102,37 +102,37 @@ void copy_general_general(

 template <typename SrcT, typename DstT>
 inline void copy_general_general(const array& src, array& dst) {
-  copy_general_general<SrcT, DstT, size_t>(
+  copy_general_general<SrcT, DstT>(
      src, dst, src.shape(), src.strides(), dst.strides(), 0, 0);
 }

-template <typename SrcT, typename DstT, typename StrideT>
+template <typename SrcT, typename DstT>
 void copy_general(
    const array& src,
    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<StrideT>& i_strides,
-    const std::vector<StrideT>&,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides&,
    int64_t i_offset,
    int64_t o_offset) {
-  copy_general_general<SrcT, DstT, StrideT>(
+  copy_general_general<SrcT, DstT>(
      src,
      dst,
      data_shape,
      i_strides,
-      make_contiguous_strides<StrideT>(data_shape),
+      make_contiguous_strides(data_shape),
      i_offset,
      o_offset);
 }

 template <typename SrcT, typename DstT>
 inline void copy_general(const array& src, array& dst) {
-  copy_general_general<SrcT, DstT, size_t>(
+  copy_general_general<SrcT, DstT>(
      src,
      dst,
      src.shape(),
      src.strides(),
-      make_contiguous_strides<size_t>(src.shape()),
+      make_contiguous_strides(src.shape()),
      0,
      0);
 }
@@ -282,13 +282,12 @@ void copy(const array& src, array& dst, CopyType ctype) {
  copy_inplace(src, dst, ctype);
 }

-template <typename StrideT>
 void copy_inplace(
    const array& src,
    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<StrideT>& i_strides,
-    const std::vector<StrideT>& o_strides,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    CopyType ctype) {
@@ -311,24 +310,4 @@ void copy_inplace(
  }
 }

-template void copy_inplace<size_t>(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<size_t>& i_strides,
-    const std::vector<size_t>& o_strides,
-    int64_t i_offset,
-    int64_t o_offset,
-    CopyType ctype);
-
-template void copy_inplace<int64_t>(
-    const array& src,
-    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<int64_t>& i_strides,
-    const std::vector<int64_t>& o_strides,
-    int64_t i_offset,
-    int64_t o_offset,
-    CopyType ctype);
-
 } // namespace mlx::core
--- a/mlx/backend/common/copy.h
+++ b/mlx/backend/common/copy.h
@@ -26,13 +26,12 @@ enum class CopyType {
 void copy(const array& src, array& dst, CopyType ctype);
 void copy_inplace(const array& src, array& dst, CopyType ctype);

-template <typename stride_t>
 void copy_inplace(
    const array& src,
    array& dst,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    const std::vector<stride_t>& o_strides,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    CopyType ctype);
--- a/mlx/backend/common/default_primitives.cpp
+++ b/mlx/backend/common/default_primitives.cpp
@@ -37,6 +37,7 @@ DEFAULT(ArgSort)
 DEFAULT(AsType)
 DEFAULT(AsStrided)
 DEFAULT(Broadcast)
+DEFAULT(BroadcastAxes)
 DEFAULT(BlockMaskedMM)
 DEFAULT(GatherMM)
 DEFAULT(GatherQMM)
@@ -57,6 +58,7 @@ DEFAULT(Equal)
 DEFAULT(Erf)
 DEFAULT(ErfInv)
 DEFAULT(Exp)
+DEFAULT(ExpandDims)
 DEFAULT(Expm1)
 DEFAULT(FFT)
 DEFAULT(Floor)
@@ -86,7 +88,6 @@ DEFAULT_MULTI(QRF)
 DEFAULT(QuantizedMatmul)
 DEFAULT(RandomBits)
 DEFAULT(Reduce)
-DEFAULT(Reshape)
 DEFAULT(Round)
 DEFAULT(Scan)
 DEFAULT(Scatter)
@@ -101,6 +102,7 @@ DEFAULT(Softmax)
 DEFAULT(Sort)
 DEFAULT_MULTI(Split)
 DEFAULT(Square)
+DEFAULT(Squeeze)
 DEFAULT(Sqrt)
 DEFAULT(StopGradient)
 DEFAULT(Subtract)
@@ -130,7 +132,7 @@ inline void matmul_common_general(
    } else {
      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
      copy(arr, arr_copy, CopyType::General);
-      size_t stx = arr.shape(-1);
+      stx = arr.shape(-1);
      return std::make_tuple(false, stx, arr_copy);
    }
  };
--- a/mlx/backend/common/indexing.cpp
+++ b/mlx/backend/common/indexing.cpp
@@ -32,7 +32,7 @@ void gather(
    const std::vector<array>& inds,
    array& out,
    const std::vector<int>& axes,
-    const std::vector<int>& slice_sizes) {
+    const Shape& slice_sizes) {
  // If the array is row contiguous then we can do a contiguous copy given
  // two conditions on the slice size:
  // - Any number of leading ones in the slice sizes are allowed
@@ -80,11 +80,10 @@ void gather(
  T* dst_ptr = out.data<T>();
  size_t out_idx = 0;

-  std::vector<ContiguousIterator<size_t>> its(inds.begin(), inds.end());
-  ContiguousIterator<size_t> src_it;
+  std::vector<ContiguousIterator> its(inds.begin(), inds.end());
+  ContiguousIterator src_it;
  if (!can_copy && src.ndim() > 0) {
-    src_it = std::move(
-        ContiguousIterator<size_t>(slice_sizes, src.strides(), src.ndim()));
+    src_it = ContiguousIterator(slice_sizes, src.strides(), src.ndim());
  }
  for (int idx = 0; idx < ind_size; idx++) {
    size_t src_idx = 0;
@@ -119,7 +118,7 @@ void dispatch_gather(
    const std::vector<array>& inds,
    array& out,
    const std::vector<int>& axes,
-    const std::vector<int>& size) {
+    const Shape& size) {
  switch (out.dtype()) {
    case bool_:
      gather<bool, IdxT>(src, inds, out, axes, size);
@@ -223,16 +222,16 @@ void scatter(
  auto inds_ndim = updates.ndim() - out.ndim();
  size_t n_updates = nind ? inds[0].size() : 1;

-  std::vector<int> update_shape(
+  Shape update_shape(
      updates.shape().begin() + inds_ndim, updates.shape().end());
  size_t update_size = 1;
  for (auto us : update_shape) {
    update_size *= us;
  }

-  std::vector<ContiguousIterator<size_t>> its(inds.begin(), inds.end());
-  ContiguousIterator<size_t> update_it(updates);
-  ContiguousIterator<size_t> out_it(update_shape, out.strides(), out.ndim());
+  std::vector<ContiguousIterator> its(inds.begin(), inds.end());
+  ContiguousIterator update_it(updates);
+  ContiguousIterator out_it(update_shape, out.strides(), out.ndim());

  for (int i = 0; i < n_updates; ++i) {
    size_t out_offset = 0;
--- a/mlx/backend/common/jit_compiler.cpp
+++ b/mlx/backend/common/jit_compiler.cpp
@@ -0,0 +1,188 @@
+// Copyright © 2024 Apple Inc.
+
+#include "mlx/backend/common/jit_compiler.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#ifndef _MSC_VER
+#include <sys/wait.h>
+#endif
+
+#include <fmt/format.h>
+
+namespace mlx::core {
+
+#ifdef _MSC_VER
+
+namespace {
+
+// Split `str` on `delimiter` and return the pieces in order.
+std::vector<std::string> str_split(const std::string& str, char delimiter) {
+  std::vector<std::string> tokens;
+  std::string token;
+  std::istringstream tokenStream(str);
+  while (std::getline(tokenStream, token, delimiter)) {
+    tokens.push_back(token);
+  }
+  return tokens;
+}
+
+// Paths of the MSVC toolchain, discovered by querying vswhere.exe and the
+// environment variables printed after running vcvarsall.bat.
+struct VisualStudioInfo {
+  VisualStudioInfo() {
+#ifdef _M_ARM64
+    arch = "arm64";
+#else
+    arch = "x64";
+#endif
+    // std::getenv returns a null pointer when the variable is unset, and a
+    // null C string must not be handed to the formatter.
+    const char* program_files = std::getenv("ProgramFiles(x86)");
+    if (!program_files) {
+      throw std::runtime_error(
+          "Environment variable ProgramFiles(x86) is not set.");
+    }
+    // Get path of Visual Studio.
+    std::string vs_path = JitCompiler::exec(fmt::format(
+        "\"{0}\\Microsoft Visual Studio\\Installer\\vswhere.exe\""
+        " -property installationPath",
+        program_files));
+    if (vs_path.empty()) {
+      throw std::runtime_error("Can not find Visual Studio.");
+    }
+    // Read the envs from vcvarsall.
+    std::string envs = JitCompiler::exec(fmt::format(
+        "\"{0}\\VC\\Auxiliary\\Build\\vcvarsall.bat\" {1} >NUL && set",
+        vs_path,
+        arch));
+    for (const std::string& line : str_split(envs, '\n')) {
+      // Each line is in the format "ENV_NAME=values".
+      auto pos = line.find_first_of('=');
+      if (pos == std::string::npos || pos == 0 || pos == line.size() - 1)
+        continue;
+      std::string name = line.substr(0, pos);
+      std::string value = line.substr(pos + 1);
+      if (name == "LIB") {
+        libpaths = str_split(value, ';');
+      } else if (name == "VCToolsInstallDir") {
+        cl_exe = fmt::format("{0}\\bin\\Host{1}\\{1}\\cl.exe", value, arch);
+      }
+    }
+    if (cl_exe.empty()) {
+      throw std::runtime_error("Can not find cl.exe of Visual Studio.");
+    }
+  }
+  std::string arch; // "x64" or "arm64"
+  std::string cl_exe; // Path of cl.exe built from VCToolsInstallDir
+  std::vector<std::string> libpaths; // Entries of the LIB variable
+};
+
+// Return the toolchain info, querying Visual Studio only on first use.
+const VisualStudioInfo& GetVisualStudioInfo() {
+  static VisualStudioInfo info;
+  return info;
+}
+
+} // namespace
+
+#endif // _MSC_VER
+
+// Build the shell command that compiles `source_file_name` (inside `dir`)
+// into the shared library `shared_lib_name` (also inside `dir`). Compiler
+// diagnostics are redirected to stdout so that exec() captures them.
+std::string JitCompiler::build_command(
+    const std::filesystem::path& dir,
+    const std::string& source_file_name,
+    const std::string& shared_lib_name) {
+#ifdef _MSC_VER
+  const VisualStudioInfo& info = GetVisualStudioInfo();
+  std::string libpaths;
+  for (const std::string& lib : info.libpaths) {
+    libpaths += fmt::format(" /libpath:\"{0}\"", lib);
+  }
+  return fmt::format(
+      "\""
+      "cd /D \"{0}\" && "
+      "\"{1}\" /LD /EHsc /MD /Ox /nologo /std:c++17 \"{2}\" "
+      "/link /out:\"{3}\" {4} 2>&1"
+      "\"",
+      dir.string(),
+      info.cl_exe,
+      source_file_name,
+      shared_lib_name,
+      libpaths);
+#else
+  return fmt::format(
+      "g++ -std=c++17 -O3 -Wall -fPIC -shared '{0}' -o '{1}' 2>&1",
+      (dir / source_file_name).string(),
+      (dir / shared_lib_name).string());
+#endif
+}
+
+// Run `cmd` through popen(), returning its captured stdout with trailing
+// whitespace removed. Throws std::runtime_error if the pipe cannot be opened
+// or closed, or if the command does not exit with code 0.
+std::string JitCompiler::exec(const std::string& cmd) {
+#ifdef _MSC_VER
+  FILE* pipe = _popen(cmd.c_str(), "r");
+#else
+  FILE* pipe = popen(cmd.c_str(), "r");
+#endif
+  if (!pipe) {
+    throw std::runtime_error("popen() failed.");
+  }
+  char buffer[128];
+  std::string ret;
+  while (fgets(buffer, sizeof(buffer), pipe)) {
+    ret += buffer;
+  }
+  // Trim trailing spaces.
+  ret.erase(
+      std::find_if(
+          ret.rbegin(),
+          ret.rend(),
+          [](unsigned char ch) { return !std::isspace(ch); })
+          .base(),
+      ret.end());
+
+#ifdef _MSC_VER
+  int status = _pclose(pipe);
+#else
+  int status = pclose(pipe);
+#endif
+  if (status == -1) {
+    throw std::runtime_error("pclose() failed.");
+  }
+#ifdef _MSC_VER
+  int code = status;
+#else
+  // WEXITSTATUS is only meaningful when the child exited normally; for a
+  // child terminated by a signal it could read as 0 and hide the failure.
+  if (!WIFEXITED(status)) {
+    throw std::runtime_error(fmt::format(
+        "Command terminated abnormally: \"{0}\", the output is: {1}",
+        cmd,
+        ret));
+  }
+  int code = WEXITSTATUS(status);
+#endif
+  if (code != 0) {
+    throw std::runtime_error(fmt::format(
+        "Failed to execute command with return code {0}: \"{1}\", "
+        "the output is: {2}",
+        code,
+        cmd,
+        ret));
+  }
+  return ret;
+}
+
+} // namespace mlx::core
--- a/mlx/backend/common/jit_compiler.h
+++ b/mlx/backend/common/jit_compiler.h
@@ -0,0 +1,20 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+
+#include <filesystem>
+
+namespace mlx::core {
+
+class JitCompiler {
+ public:
+  // Build a shell command compiling a source file in `dir` to a shared library.
+  static std::string build_command(
+      const std::filesystem::path& dir,
+      const std::string& source_file_name,
+      const std::string& shared_lib_name);
+
+  // Run a command and return its output; throws std::runtime_error on failure.
+  static std::string exec(const std::string& cmd);
+};
+
+} // namespace mlx::core
--- a/mlx/backend/common/lapack.h
+++ b/mlx/backend/common/lapack.h
@@ -2,6 +2,15 @@

 #pragma once

+// Required for Visual Studio.
+// https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/install.md
+#ifdef _MSC_VER
+#include <complex>
+#define LAPACK_COMPLEX_CUSTOM
+#define lapack_complex_float std::complex<float>
+#define lapack_complex_double std::complex<double>
+#endif
+
 #ifdef ACCELERATE_NEW_LAPACK
 #include <Accelerate/Accelerate.h>
 #else
--- a/mlx/backend/common/make_compiled_preamble.ps1
+++ b/mlx/backend/common/make_compiled_preamble.ps1
@@ -0,0 +1,38 @@
+# This script generates a C++ function that provides the CPU
+# code for use with kernel generation.
+#
+# Arguments:
+#   0: path of the C++ file to write
+#   1: compiler executable used to preprocess the preamble header
+#   2: root of the source tree (used as include directory)
+#
+# Copyright © 2024 Apple Inc.
+
+$OUTPUT_FILE = $args[0]
+$CL = $args[1]
+$SRCDIR = $args[2]
+
+# Get command result as array.
+$CONTENT = & $CL /std:c++17 /EP "/I$SRCDIR" /Tp "$SRCDIR/mlx/backend/common/compiled_preamble.h"
+# Stop if preprocessing failed, otherwise an empty or truncated preamble would
+# be silently written to the output file.
+if ($LASTEXITCODE -ne 0) {
+  throw "Failed to preprocess compiled_preamble.h with '$CL' (exit code $LASTEXITCODE)."
+}
+# Remove empty lines.
+# Otherwise there will be too many empty lines making the result unreadable.
+$CONTENT = $CONTENT | Where-Object { $_.Trim() -ne '' }
+# Concatenate to string.
+$CONTENT = $CONTENT -join "`n"
+
+# Append extra content.
+$CONTENT = @"
+$($CONTENT)
+using namespace mlx::core;
+using namespace mlx::core::detail;
+"@
+
+# Convert each char to ASCII code.
+# Unlike the unix script that outputs string literal directly, the output from
+# MSVC is way too large to be embedded as string and compilation will fail, so
+# we store it as static array instead. The trailing 0 terminates the string.
+$CHARCODES = ([System.Text.Encoding]::ASCII.GetBytes($CONTENT) -join ', ') + ', 0'
+
+$OUTPUT = @"
+const char* get_kernel_preamble() {
+  static char preamble[] = { $CHARCODES };
+  return preamble;
+}
+"@
+
+Set-Content -Path $OUTPUT_FILE -Value $OUTPUT
--- a/mlx/backend/common/make_compiled_preamble.sh
+++ b/mlx/backend/common/make_compiled_preamble.sh
@@ -10,15 +10,16 @@ OUTPUT_FILE=$1
 GCC=$2
 SRCDIR=$3
 CLANG=$4
+ARCH=$5

 if [ "$CLANG" = "TRUE" ]; then
  read -r -d '' INCLUDES <<- EOM
-  #include <cmath>
-  #include <complex>
-  #include <cstdint>
-  #include <vector>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <vector>
 EOM
-CC_FLAGS=""
+CC_FLAGS="-arch ${ARCH}"
 else
 CC_FLAGS="-std=c++17"
 fi
--- a/mlx/backend/common/masked_mm.cpp
+++ b/mlx/backend/common/masked_mm.cpp
@@ -19,10 +19,10 @@ inline void mask_matrix(
    int block_size,
    const int X,
    const int Y,
-    const size_t X_data_str,
-    const size_t Y_data_str,
-    const size_t X_mask_str,
-    const size_t Y_mask_str,
+    const int64_t X_data_str,
+    const int64_t Y_data_str,
+    const int64_t X_mask_str,
+    const int64_t Y_mask_str,
    const size_t mask_offset) {
  int tX = (X + block_size - 1) / block_size;
  int tY = (Y + block_size - 1) / block_size;
@@ -84,7 +84,7 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
        } else {
          array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
          copy(arr, arr_copy, CopyType::General);
-          size_t stx = arr.shape(-1);
+          int64_t stx = arr.shape(-1);
          return std::make_tuple(false, stx, arr_copy);
        }
      };
@@ -117,13 +117,13 @@ void BlockMaskedMM::eval(const std::vector<array>& inputs, array& out) {
                       int Y,
                       size_t X_data_str,
                       size_t Y_data_str) {
-    size_t mask_offset = elem_to_loc(
+    auto mask_offset = elem_to_loc(
        mask.shape(-1) * mask.shape(-2) * batch_idx,
        mask.shape(),
        mask.strides());

-    size_t X_mask_str = mask.strides()[mask.ndim() - 2];
-    size_t Y_mask_str = mask.strides()[mask.ndim() - 1];
+    auto X_mask_str = mask.strides()[mask.ndim() - 2];
+    auto Y_mask_str = mask.strides()[mask.ndim() - 1];

    if (mask.dtype() == bool_) {
      return mask_matrix(
@@ -230,7 +230,7 @@ void GatherMM::eval(const std::vector<array>& inputs, array& out) {
    } else {
      array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
      copy(arr, arr_copy, CopyType::General);
-      size_t stx = arr.shape(-1);
+      int64_t stx = arr.shape(-1);
      return std::make_tuple(false, stx, arr_copy);
    }
  };
@@ -262,13 +262,13 @@ void GatherMM::eval(const std::vector<array>& inputs, array& out) {
  auto& lhs_indices = inputs[2];
  auto& rhs_indices = inputs[3];

-  std::vector<int> batch_shape = get_batch_dims(out.shape());
+  auto batch_shape = get_batch_dims(out.shape());
  int batch_ndim = batch_shape.size();

-  std::vector<int> batch_shape_A = get_batch_dims(a.shape());
-  std::vector<size_t> batch_strides_A = get_batch_dims(a.strides());
-  std::vector<int> batch_shape_B = get_batch_dims(b.shape());
-  std::vector<size_t> batch_strides_B = get_batch_dims(b.strides());
+  auto batch_shape_A = get_batch_dims(a.shape());
+  auto batch_strides_A = get_batch_dims(a.strides());
+  auto batch_shape_B = get_batch_dims(b.shape());
+  auto batch_strides_B = get_batch_dims(b.strides());

  const uint32_t* lhs_indices_ptr = lhs_indices.data<uint32_t>();
  const uint32_t* rhs_indices_ptr = rhs_indices.data<uint32_t>();
--- a/mlx/backend/common/ops.h
+++ b/mlx/backend/common/ops.h
@@ -500,7 +500,12 @@ struct Equal {
 struct NaNEqual {
  template <typename T>
  bool operator()(T x, T y) {
-    return x == y || (std::isnan(x) && std::isnan(y));
+    if constexpr (std::is_integral_v<T>) {
+      // isnan always returns false for integers, and MSVC refuses to compile.
+      return x == y;
+    } else {
+      return x == y || (std::isnan(x) && std::isnan(y));
+    }
  }
 };

--- a/mlx/backend/common/primitives.cpp
+++ b/mlx/backend/common/primitives.cpp
@@ -19,6 +19,45 @@

 namespace mlx::core {

+// Reshape `in` into `out`. When prepare_reshape() reports that no copy is
+// necessary, `out` shares the input buffer with the returned strides;
+// otherwise a new buffer is allocated and filled with a general copy.
+void reshape(const array& in, array& out) {
+  auto [copy_necessary, out_strides] = prepare_reshape(in, out);
+  if (!copy_necessary) {
+    shared_buffer_reshape(in, out_strides, out);
+    return;
+  }
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  copy_inplace(in, out, CopyType::General);
+}
+
+// Fold the start indices stored in `indices` into a single element offset:
+// the sum over k of indices[k] * strides[axes[k]].
+//
+// Signed index dtypes are read through the unsigned type of the same width.
+// NOTE(review): this presumably relies on callers never passing negative
+// start indices -- confirm.
+//
+// Throws std::runtime_error for any dtype other than the 8/16/32/64-bit
+// integer types.
+int64_t compute_dynamic_offset(
+    const array& indices,
+    const Strides& strides,
+    const std::vector<int>& axes) {
+  auto accumulate = [&strides, &axes](const auto* idx) {
+    int64_t total = 0;
+    int k = 0;
+    for (int axis : axes) {
+      total += idx[k] * strides[axis];
+      ++k;
+    }
+    return total;
+  };
+  switch (indices.dtype()) {
+    case int8:
+    case uint8:
+      return accumulate(indices.data<uint8_t>());
+    case int16:
+    case uint16:
+      return accumulate(indices.data<uint16_t>());
+    case int32:
+    case uint32:
+      return accumulate(indices.data<uint32_t>());
+    case int64:
+    case uint64:
+      return accumulate(indices.data<uint64_t>());
+    default:
+      throw std::runtime_error("Invalid indices type.");
+  }
+}
+
 void Abs::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
@@ -258,6 +297,14 @@ void Expm1::eval(const std::vector<array>& inputs, array& out) {
  }
 }

+// CPU evaluation of Flatten: forwards inputs[0] to the shared reshape() helper.
+void Flatten::eval_cpu(const std::vector<array>& inputs, array& out) {
+  const array& in = inputs[0];
+  reshape(in, out);
+}
+
+// CPU evaluation of Unflatten: forwards inputs[0] to the shared reshape()
+// helper.
+void Unflatten::eval_cpu(const std::vector<array>& inputs, array& out) {
+  const array& in = inputs[0];
+  reshape(in, out);
+}
+
 void Floor::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
@@ -417,18 +464,8 @@ void Real::eval_cpu(const std::vector<array>& inputs, array& out) {
  unary_op<complex64_t, float>(inputs[0], out, detail::Real());
 }

-void Reshape::eval(const std::vector<array>& inputs, array& out) {
-  assert(inputs.size() == 1);
-  const auto& in = inputs[0];
-
-  auto [copy_necessary, out_strides] = prepare_reshape(in, out);
-
-  if (copy_necessary) {
-    out.set_data(allocator::malloc_or_wait(out.nbytes()));
-    copy_inplace(in, out, CopyType::General);
-  } else {
-    shared_buffer_reshape(in, out_strides, out);
-  }
+void Reshape::eval_cpu(const std::vector<array>& inputs, array& out) {
+  reshape(inputs[0], out);
 }

 void Round::eval(const std::vector<array>& inputs, array& out) {
@@ -498,34 +535,65 @@ void Slice::eval(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];

  // Calculate out strides, initial offset and if copy needs to be made
-  auto [copy_needed, data_offset, inp_strides] =
-      prepare_slice(in, start_indices_, strides_);
-
-  // Do copy if needed
-  if (copy_needed) {
-    out.set_data(allocator::malloc_or_wait(out.nbytes()));
-    std::vector<int64_t> ostrides{out.strides().begin(), out.strides().end()};
-    copy_inplace<int64_t>(
-        /* const array& src = */ in,
-        /* array& dst = */ out,
-        /* const std::vector<int>& data_shape = */ out.shape(),
-        /* const std::vector<stride_t>& i_strides = */ inp_strides,
-        /* const std::vector<stride_t>& o_strides = */ ostrides,
-        /* int64_t i_offset = */ data_offset,
-        /* int64_t o_offset = */ 0,
-        /* CopyType ctype = */ CopyType::General);
-  } else {
-    size_t data_end = 1;
-    for (int i = 0; i < end_indices_.size(); ++i) {
-      if (in.shape()[i] > 1) {
-        auto end_idx = start_indices_[i] + out.shape()[i] * strides_[i] - 1;
-        data_end += end_idx * in.strides()[i];
-      }
+  auto [data_offset, inp_strides] = prepare_slice(in, start_indices_, strides_);
+  size_t data_end = 1;
+  for (int i = 0; i < end_indices_.size(); ++i) {
+    if (in.shape()[i] > 1) {
+      auto end_idx = start_indices_[i] + out.shape()[i] * strides_[i] - 1;
+      data_end += end_idx * in.strides()[i];
    }
-    size_t data_size = data_end - data_offset;
-    std::vector<size_t> ostrides{inp_strides.begin(), inp_strides.end()};
-    shared_buffer_slice(in, ostrides, data_offset, data_size, out);
  }
+  size_t data_size = data_end - data_offset;
+  Strides ostrides{inp_strides.begin(), inp_strides.end()};
+  shared_buffer_slice(in, ostrides, data_offset, data_size, out);
+}
+
+// Copy the region of inputs[0] that starts at the offset computed from the
+// start indices in inputs[1] (along axes_) into a newly allocated `out`.
+void DynamicSlice::eval_cpu(const std::vector<array>& inputs, array& out) {
+  // An empty result has nothing to copy.
+  if (out.size() == 0) {
+    out.set_data(nullptr);
+    return;
+  }
+  const array& src = inputs[0];
+  const array& start = inputs[1];
+  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  const auto src_offset = compute_dynamic_offset(start, src.strides(), axes_);
+  // The copy iterates over the output shape while reading with the source
+  // strides, beginning at the computed offset.
+  copy_inplace(
+      /* src = */ src,
+      /* dst = */ out,
+      /* data_shape = */ out.shape(),
+      /* i_strides = */ src.strides(),
+      /* o_strides = */ out.strides(),
+      /* i_offset = */ src_offset,
+      /* o_offset = */ 0,
+      /* ctype = */ CopyType::GeneralGeneral);
+}
+
+// Fill `out` with a copy of inputs[0], then overwrite the region that starts
+// at the offset computed from the start indices in inputs[2] (along axes_)
+// with the contents of inputs[1].
+void DynamicSliceUpdate::eval_cpu(
+    const std::vector<array>& inputs,
+    array& out) {
+  // An empty result has nothing to copy.
+  if (out.size() == 0) {
+    out.set_data(nullptr);
+    return;
+  }
+
+  const array& src = inputs[0];
+  const array& update = inputs[1];
+  const array& start = inputs[2];
+
+  // Copy mode for the source: Scalar when it holds a single element, Vector
+  // when it is contiguous and size() equals data_size(), General otherwise.
+  CopyType src_copy_type = CopyType::General;
+  if (src.data_size() == 1) {
+    src_copy_type = CopyType::Scalar;
+  } else if (src.flags().contiguous && src.size() == src.data_size()) {
+    src_copy_type = CopyType::Vector;
+  }
+  copy(src, out, src_copy_type);
+
+  // Write the update over the copied data, starting at the computed offset.
+  const auto dst_offset = compute_dynamic_offset(start, out.strides(), axes_);
+  copy_inplace(
+      /* src = */ update,
+      /* dst = */ out,
+      /* data_shape = */ update.shape(),
+      /* i_strides = */ update.strides(),
+      /* o_strides = */ out.strides(),
+      /* i_offset = */ 0,
+      /* o_offset = */ dst_offset,
+      /* ctype = */ CopyType::GeneralGeneral);
+}

 void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
@@ -550,15 +618,14 @@ void SliceUpdate::eval(const std::vector<array>& inputs, array& out) {
  copy(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype);

  // Calculate out strides, initial offset and if copy needs to be made
-  auto [data_offset, out_strides] = prepare_slice(out);
+  auto [data_offset, out_strides] = prepare_slice(in, start_indices_, strides_);

  // Do copy
-  std::vector<int64_t> upd_strides{upd.strides().begin(), upd.strides().end()};
-  copy_inplace<int64_t>(
+  copy_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const std::vector<int>& data_shape = */ upd.shape(),
-      /* const std::vector<stride_t>& i_strides = */ upd_strides,
+      /* const std::vector<stride_t>& i_strides = */ upd.strides(),
      /* const std::vector<stride_t>& o_strides = */ out_strides,
      /* int64_t i_offset = */ 0,
      /* int64_t o_offset = */ data_offset,
@@ -614,7 +681,7 @@ void View::eval_cpu(const std::vector<array>& inputs, array& out) {
  // - type size is the same
  // - type size is smaller and the last axis is contiguous
  // - the entire array is row contiguous
-  if (ibytes == obytes || obytes < ibytes && in.strides().back() == 1 ||
+  if (ibytes == obytes || (obytes < ibytes && in.strides().back() == 1) ||
      in.flags().row_contiguous) {
    auto strides = in.strides();
    for (int i = 0; i < static_cast<int>(strides.size()) - 1; ++i) {
--- a/mlx/backend/common/qrf.cpp
+++ b/mlx/backend/common/qrf.cpp
@@ -54,7 +54,7 @@ void qrf_impl(const array& a, array& q, array& r) {
  // Copy the input to be column contiguous
  flags.col_contiguous = num_matrices == 1;
  flags.row_contiguous = false;
-  std::vector<size_t> strides = in.strides();
+  auto strides = in.strides();
  strides[in.ndim() - 2] = 1;
  strides[in.ndim() - 1] = M;
  in.set_data(
--- a/mlx/backend/common/reduce.cpp
+++ b/mlx/backend/common/reduce.cpp
@@ -174,19 +174,19 @@ void reduce_dispatch_min_max(

 void nd_loop(
    std::function<void(int)> callback,
-    const std::vector<int>& shape,
-    const std::vector<size_t>& strides) {
+    const Shape& shape,
+    const Strides& strides) {
  std::function<void(int, int)> loop_inner;
  loop_inner = [&](int dim, int offset) {
    if (dim < shape.size() - 1) {
-      int size = shape[dim];
-      size_t stride = strides[dim];
+      auto size = shape[dim];
+      auto stride = strides[dim];
      for (int i = 0; i < size; i++) {
        loop_inner(dim + 1, offset + i * stride);
      }
    } else {
-      int size = shape[dim];
-      size_t stride = strides[dim];
+      auto size = shape[dim];
+      auto stride = strides[dim];
      for (int i = 0; i < size; i++) {
        callback(offset + i * stride);
      }
--- a/mlx/backend/common/reduce.h
+++ b/mlx/backend/common/reduce.h
@@ -38,13 +38,10 @@ enum ReductionOpType {

 struct ReductionPlan {
  ReductionOpType type;
-  std::vector<int> shape;
-  std::vector<size_t> strides;
+  Shape shape;
+  Strides strides;

-  ReductionPlan(
-      ReductionOpType type_,
-      std::vector<int> shape_,
-      std::vector<size_t> strides_)
+  ReductionPlan(ReductionOpType type_, Shape shape_, Strides strides_)
      : type(type_), shape(std::move(shape_)), strides(std::move(strides_)) {}
  ReductionPlan(ReductionOpType type_) : type(type_) {}
 };
@@ -55,10 +52,10 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);
 // Should this be in utils?
 void nd_loop(
    std::function<void(int)> callback,
-    const std::vector<int>& shape,
-    const std::vector<size_t>& strides);
+    const Shape& shape,
+    const Strides& strides);

-std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
+std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes);

@@ -113,9 +110,6 @@ void reduction_op(
    return;
  }

-  std::vector<int> shape;
-  std::vector<size_t> strides;
-
  if (plan.type == ContiguousReduce && plan.shape.size() == 1) {
    int reduction_size = plan.shape[0];
    const T* x_ptr = x.data<T>();
@@ -135,7 +129,7 @@ void reduction_op(
    U* out_ptr = out.data<U>();
    // Unrolling the following loop (and implementing it in order for
    // ContiguousReduce) should hold extra performance boost.
-    std::tie(shape, strides) = shapes_without_reduction_axes(x, axes);
+    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
    if (plan.shape.size() == 0) {
      for (int i = 0; i < out.size(); i++, out_ptr++) {
        int offset = elem_to_loc(i, shape, strides);
@@ -181,7 +175,7 @@ void reduction_op(
    plan.strides.pop_back();
    const T* x_ptr = x.data<T>();
    U* out_ptr = out.data<U>();
-    std::tie(shape, strides) = shapes_without_reduction_axes(x, axes);
+    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
    if (plan.shape.size() == 0) {
      for (int i = 0; i < out.size(); i += reduction_stride) {
        int offset = elem_to_loc(i, shape, strides);
@@ -211,7 +205,7 @@ void reduction_op(
  if (plan.type == GeneralReduce) {
    const T* x_ptr = x.data<T>();
    U* out_ptr = out.data<U>();
-    std::tie(shape, strides) = shapes_without_reduction_axes(x, axes);
+    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
    for (int i = 0; i < out.size(); i++, out_ptr++) {
      int offset = elem_to_loc(i, shape, strides);
      U val = init;
--- a/mlx/backend/common/reduce_utils.cpp
+++ b/mlx/backend/common/reduce_utils.cpp
@@ -4,11 +4,11 @@

 namespace mlx::core {

-std::pair<std::vector<int>, std::vector<size_t>> shapes_without_reduction_axes(
+std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes) {
-  std::vector<int> shape = x.shape();
-  std::vector<size_t> strides = x.strides();
+  auto shape = x.shape();
+  auto strides = x.strides();

  for (int i = axes.size() - 1; i >= 0; i--) {
    int a = axes[i];
@@ -29,8 +29,8 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // Row contiguous input so the output is row contiguous
  if (x.flags().row_contiguous) {
    // Merge consecutive axes
-    std::vector<int> shape = {x.shape(axes[0])};
-    std::vector<size_t> strides = {x.strides()[axes[0]]};
+    Shape shape = {x.shape(axes[0])};
+    Strides strides = {x.strides()[axes[0]]};
    for (int i = 1; i < axes.size(); i++) {
      if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
        shape.back() *= x.shape(axes[i]);
@@ -69,7 +69,7 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {

  // Sort reduction axes by stride in order to merge them and figure out if we
  // have a contiguous reduction.
-  std::vector<std::pair<int, size_t>> reductions;
+  std::vector<std::pair<int, int64_t>> reductions;
  for (auto a : axes) {
    if (x.shape(a) > 1) {
      reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
@@ -93,8 +93,8 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
    }
  }

-  std::vector<int> shape;
-  std::vector<size_t> strides;
+  Shape shape;
+  Strides strides;
  for (auto r : reductions) {
    shape.push_back(r.first);
    strides.push_back(r.second);
@@ -109,15 +109,15 @@ ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // Delegate to the general strided reduction op if the axes after
  // strides.back() are contiguous.
  if (strides.back() > 1) {
-    int size = 1;
+    int64_t size = 1;
    bool have_expand = false;
    for (int i = x.ndim() - 1; i >= 0; i--) {
      if (axes.back() == i) {
        continue;
      }

-      size_t stride_i = x.strides()[i];
-      int shape_i = x.shape(i);
+      auto stride_i = x.strides()[i];
+      auto shape_i = x.shape(i);
      if (stride_i == 0) {
        if (shape_i == 1) {
          continue;
--- a/mlx/backend/common/slicing.cpp
+++ b/mlx/backend/common/slicing.cpp
@@ -4,24 +4,22 @@

 namespace mlx::core {

-std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
+std::tuple<int64_t, Strides> prepare_slice(
    const array& in,
-    const std::vector<int>& start_indices,
-    const std::vector<int>& strides) {
+    const Shape& start_indices,
+    const Shape& strides) {
  int64_t data_offset = 0;
-  bool copy_needed = false;
-  std::vector<int64_t> inp_strides(in.ndim(), 0);
+  Strides inp_strides(in.ndim(), 0);
  for (int i = 0; i < in.ndim(); ++i) {
    data_offset += start_indices[i] * in.strides()[i];
    inp_strides[i] = in.strides()[i] * strides[i];
-    copy_needed |= strides[i] < 0;
  }
-  return std::make_tuple(copy_needed, data_offset, inp_strides);
+  return std::make_tuple(data_offset, inp_strides);
 }

 void shared_buffer_slice(
    const array& in,
-    const std::vector<size_t>& out_strides,
+    const Strides& out_strides,
    size_t data_offset,
    size_t data_size,
    array& out) {
--- a/mlx/backend/common/slicing.h
+++ b/mlx/backend/common/slicing.h
@@ -6,14 +6,14 @@

 namespace mlx::core {

-std::tuple<bool, int64_t, std::vector<int64_t>> prepare_slice(
+std::tuple<int64_t, Strides> prepare_slice(
    const array& in,
-    const std::vector<int>& start_indices,
-    const std::vector<int>& strides);
+    const Shape& start_indices,
+    const Shape& strides);

 void shared_buffer_slice(
    const array& in,
-    const std::vector<size_t>& out_strides,
+    const Strides& out_strides,
    size_t data_offset,
    size_t data_size,
    array& out);
--- a/mlx/backend/common/sort.cpp
+++ b/mlx/backend/common/sort.cpp
@@ -14,10 +14,10 @@ namespace mlx::core {

 namespace {

-template <typename T, typename IdxT = int32_t>
+template <typename T>
 struct StridedIterator {
  using iterator_category = std::random_access_iterator_tag;
-  using difference_type = IdxT;
+  using difference_type = int32_t;
  using value_type = T;
  using reference = value_type&;
  using pointer = value_type*;
@@ -25,7 +25,7 @@ struct StridedIterator {
  // Constructors
  StridedIterator() = default;

-  explicit StridedIterator(T* ptr, size_t stride, difference_type offset = 0)
+  explicit StridedIterator(T* ptr, int64_t stride, difference_type offset = 0)
      : ptr_(ptr + offset * stride), stride_(stride) {}

  explicit StridedIterator(array& arr, int axis, difference_type offset = 0)
@@ -99,7 +99,7 @@ struct StridedIterator {
  }

 private:
-  size_t stride_;
+  int64_t stride_;
  T* ptr_;
 };

@@ -120,11 +120,11 @@ void sort(const array& in, array& out, int axis) {
  auto remaining_strides = out.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

-  size_t axis_stride = out.strides()[axis];
-  int axis_size = out.shape(axis);
+  auto axis_stride = out.strides()[axis];
+  auto axis_size = out.shape(axis);

  // Perform sorting in place
-  ContiguousIterator<size_t> src_it(
+  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  for (int i = 0; i < n_rows; i++) {
    T* data_ptr = out.data<T>() + src_it.loc;
@@ -158,14 +158,14 @@ void argsort(const array& in, array& out, int axis) {
  auto out_remaining_strides = out.strides();
  out_remaining_strides.erase(out_remaining_strides.begin() + axis);

-  size_t in_stride = in.strides()[axis];
-  size_t out_stride = out.strides()[axis];
-  int axis_size = in.shape(axis);
+  auto in_stride = in.strides()[axis];
+  auto out_stride = out.strides()[axis];
+  auto axis_size = in.shape(axis);

  // Perform sorting
-  ContiguousIterator<size_t> in_it(
+  ContiguousIterator in_it(
      in_remaining_shape, in_remaining_strides, in_remaining_shape.size());
-  ContiguousIterator<size_t> out_it(
+  ContiguousIterator out_it(
      out_remaining_shape, out_remaining_strides, out_remaining_shape.size());
  for (int i = 0; i < n_rows; i++) {
    const T* data_ptr = in.data<T>() + in_it.loc;
@@ -208,13 +208,13 @@ void partition(const array& in, array& out, int axis, int kth) {
  auto remaining_strides = in.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

-  size_t axis_stride = in.strides()[axis];
+  auto axis_stride = in.strides()[axis];
  int axis_size = in.shape(axis);

  kth = kth < 0 ? kth + axis_size : kth;

  // Perform partition in place
-  ContiguousIterator<size_t> src_it(
+  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  for (int i = 0; i < n_rows; i++) {
    T* data_ptr = out.data<T>() + src_it.loc;
@@ -249,16 +249,16 @@ void argpartition(const array& in, array& out, int axis, int kth) {
  auto out_remaining_strides = out.strides();
  out_remaining_strides.erase(out_remaining_strides.begin() + axis);

-  size_t in_stride = in.strides()[axis];
-  size_t out_stride = out.strides()[axis];
-  int axis_size = in.shape(axis);
+  auto in_stride = in.strides()[axis];
+  auto out_stride = out.strides()[axis];
+  auto axis_size = in.shape(axis);

  kth = kth < 0 ? kth + axis_size : kth;

  // Perform partition
-  ContiguousIterator<size_t> in_it(
+  ContiguousIterator in_it(
      in_remaining_shape, in_remaining_strides, in_remaining_shape.size());
-  ContiguousIterator<size_t> out_it(
+  ContiguousIterator out_it(
      out_remaining_shape, out_remaining_strides, out_remaining_shape.size());
  for (int i = 0; i < n_rows; i++) {
    const T* data_ptr = in.data<T>() + in_it.loc;
--- a/mlx/backend/common/ternary.h
+++ b/mlx/backend/common/ternary.h
@@ -67,7 +67,12 @@ void set_ternary_op_output_data(
      }
      break;
    case TernaryOpType::General:
-      out.set_data(allocator::malloc_or_wait(out.nbytes()));
+      // Try to donate an input which is row_contiguous
+      if (!((a.flags().row_contiguous && maybe_donate(a)) ||
+            (b.flags().row_contiguous && maybe_donate(b)) ||
+            (c.flags().row_contiguous && maybe_donate(c)))) {
+        out.set_data(allocator::malloc_or_wait(out.nbytes()));
+      }
      break;
  }
 }
@@ -78,11 +83,11 @@ void ternary_op_dims(
    const T3* c,
    U* out,
    Op op,
-    const std::vector<int>& shape,
-    const std::vector<size_t>& a_strides,
-    const std::vector<size_t>& b_strides,
-    const std::vector<size_t>& c_strides,
-    const std::vector<size_t>& out_strides,
+    const Shape& shape,
+    const Strides& a_strides,
+    const Strides& b_strides,
+    const Strides& c_strides,
+    const Strides& out_strides,
    int axis) {
  auto stride_a = a_strides[axis];
  auto stride_b = b_strides[axis];
@@ -164,10 +169,10 @@ void ternary_op_dispatch_dims(
      return;
  }

-  ContiguousIterator<size_t> a_it(shape, a_strides, ndim - 2);
-  ContiguousIterator<size_t> b_it(shape, b_strides, ndim - 2);
-  ContiguousIterator<size_t> c_it(shape, c_strides, ndim - 2);
-  size_t stride = out_strides[ndim - 3];
+  ContiguousIterator a_it(shape, a_strides, ndim - 2);
+  ContiguousIterator b_it(shape, b_strides, ndim - 2);
+  ContiguousIterator c_it(shape, c_strides, ndim - 2);
+  auto stride = out_strides[ndim - 3];
  for (size_t elem = 0; elem < a.size(); elem += stride) {
    ternary_op_dims<T1, T2, T3, U, Op, 2>(
        a_ptr + a_it.loc,
--- a/mlx/backend/common/utils.cpp
+++ b/mlx/backend/common/utils.cpp
@@ -15,7 +15,7 @@ void move_or_copy(const array& in, array& out) {
 void move_or_copy(
    const array& in,
    array& out,
-    const std::vector<size_t>& strides,
+    const Strides& strides,
    array::Flags flags,
    size_t data_size,
    size_t offset /* = 0 */) {
@@ -26,15 +26,13 @@ void move_or_copy(
  }
 }

-template <typename StrideT>
-std::tuple<std::vector<int>, std::vector<std::vector<StrideT>>>
-collapse_contiguous_dims_impl(
-    const std::vector<int>& shape,
-    const std::vector<std::vector<StrideT>>& strides,
-    StrideT size_cap) {
+std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
+    const Shape& shape,
+    const std::vector<Strides>& strides,
+    int64_t size_cap) {
  // Make a vector that has axes separated with -1. Collapse all axes between
  // -1.
-  std::vector<int> to_collapse;
+  Shape to_collapse;
  if (shape.size() > 0) {
    if (shape[0] != 1) {
      to_collapse.push_back(0);
@@ -43,7 +41,7 @@ collapse_contiguous_dims_impl(
    for (int i = 1; i < shape.size(); i++) {
      bool contiguous = true;
      size *= shape[i];
-      for (const std::vector<StrideT>& st : strides) {
+      for (const auto& st : strides) {
        if (st[i] * shape[i] != st[i - 1] || size > size_cap) {
          contiguous = false;
          size = shape[i];
@@ -60,8 +58,8 @@ collapse_contiguous_dims_impl(
    to_collapse.push_back(-1);
  }

-  std::vector<int> out_shape;
-  std::vector<std::vector<StrideT>> out_strides(strides.size());
+  Shape out_shape;
+  std::vector<Strides> out_strides(strides.size());
  for (int i = 0;;) {
    while (i < to_collapse.size() && to_collapse[i] == -1) {
      ++i;
@@ -76,7 +74,7 @@ collapse_contiguous_dims_impl(
    }
    out_shape.push_back(current_shape);
    for (int j = 0; j < strides.size(); j++) {
-      const std::vector<StrideT>& st = strides[j];
+      const auto& st = strides[j];
      out_strides[j].push_back(st[to_collapse[k - 1]]);
    }
    i = k + 1;
@@ -91,29 +89,12 @@ collapse_contiguous_dims_impl(
  return std::make_tuple(out_shape, out_strides);
 }

-std::tuple<std::vector<int>, std::vector<std::vector<int64_t>>>
-collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<std::vector<int64_t>>& strides,
-    int64_t size_cap /* = std::numeric_limits<int32_t>::max() */) {
-  return collapse_contiguous_dims_impl(shape, strides, size_cap);
-}
-
-std::tuple<std::vector<int>, std::vector<std::vector<size_t>>>
-collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<std::vector<size_t>>& strides,
-    size_t size_cap /* = std::numeric_limits<int32>::max() */) {
-  return collapse_contiguous_dims_impl(shape, strides, size_cap);
-}
-
-template <typename StrideT>
-std::pair<std::vector<int>, std::vector<StrideT>> collapse_contiguous_dims_impl(
-    const std::vector<int>& shape,
-    const std::vector<StrideT>& strides,
-    StrideT size_cap) {
-  std::vector<int> collapsed_shape;
-  std::vector<StrideT> collapsed_strides;
+std::pair<Shape, Strides> collapse_contiguous_dims(
+    const Shape& shape,
+    const Strides& strides,
+    int64_t size_cap) {
+  Shape collapsed_shape;
+  Strides collapsed_strides;

  if (shape.size() > 0) {
    collapsed_shape.push_back(shape[0]);
@@ -123,7 +104,7 @@ std::pair<std::vector<int>, std::vector<StrideT>> collapse_contiguous_dims_impl(
        continue;
      } else if (
          strides[i] * shape[i] != collapsed_strides.back() ||
-          collapsed_shape.back() * static_cast<StrideT>(shape[i]) > size_cap) {
+          collapsed_shape.back() * static_cast<int64_t>(shape[i]) > size_cap) {
        collapsed_shape.push_back(shape[i]);
        collapsed_strides.push_back(strides[i]);
      } else {
@@ -136,25 +117,10 @@ std::pair<std::vector<int>, std::vector<StrideT>> collapse_contiguous_dims_impl(
  return std::make_pair(collapsed_shape, collapsed_strides);
 }

-std::pair<std::vector<int>, std::vector<int64_t>> collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<int64_t>& strides,
-    int64_t size_cap /* = std::numeric_limits<int32_t>::max() */) {
-  return collapse_contiguous_dims_impl<int64_t>(shape, strides, size_cap);
-}
-
-std::pair<std::vector<int>, std::vector<size_t>> collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<size_t>& strides,
-    size_t size_cap /* = std::numeric_limits<int32_t>::max() */) {
-  return collapse_contiguous_dims_impl<size_t>(shape, strides, size_cap);
-}
-
-std::pair<std::vector<int>, std::vector<size_t>> collapse_contiguous_dims(
+std::pair<Shape, Strides> collapse_contiguous_dims(
    const array& a,
-    size_t size_cap /* = std::numeric_limits<int32_t>::max()*/) {
-  return collapse_contiguous_dims_impl<size_t>(
-      a.shape(), a.strides(), size_cap);
+    int64_t size_cap /* = std::numeric_limits<int32_t>::max()*/) {
+  return collapse_contiguous_dims(a.shape(), a.strides(), size_cap);
 }

 } // namespace mlx::core
--- a/mlx/backend/common/utils.h
+++ b/mlx/backend/common/utils.h
@@ -8,12 +8,9 @@

 namespace mlx::core {

-template <typename StrideT>
-inline StrideT elem_to_loc(
-    int elem,
-    const std::vector<int>& shape,
-    const std::vector<StrideT>& strides) {
-  StrideT loc = 0;
+inline int64_t
+elem_to_loc(int elem, const Shape& shape, const Strides& strides) {
+  int64_t loc = 0;
  for (int i = shape.size() - 1; i >= 0; --i) {
    auto q_and_r = ldiv(elem, shape[i]);
    loc += q_and_r.rem * strides[i];
@@ -22,16 +19,15 @@ inline StrideT elem_to_loc(
  return loc;
 }

-inline size_t elem_to_loc(int elem, const array& a) {
+inline int64_t elem_to_loc(int elem, const array& a) { // row-contiguous arrays return elem as-is; otherwise shape/strides are applied
  if (a.flags().row_contiguous) {
    return elem;
  }
  return elem_to_loc(elem, a.shape(), a.strides());
 }

-template <typename StrideT>
-std::vector<StrideT> make_contiguous_strides(const std::vector<int>& shape) {
-  std::vector<StrideT> strides(shape.size(), 1);
+inline Strides make_contiguous_strides(const Shape& shape) {
+  Strides strides(shape.size(), 1);
  for (int i = shape.size() - 1; i > 0; i--) {
    strides[i - 1] = strides[i] * shape[i];
  }
@@ -44,22 +40,15 @@ std::vector<StrideT> make_contiguous_strides(const std::vector<int>& shape) {
 //
 // When multiple arrays are passed they should all have the same shape. The
 // collapsed axes are also the same so one shape is returned.
-std::tuple<std::vector<int>, std::vector<std::vector<int64_t>>>
-collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<std::vector<int64_t>>& strides,
+std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
+    const Shape& shape,
+    const std::vector<Strides>& strides,
    int64_t size_cap = std::numeric_limits<int32_t>::max());
-std::tuple<std::vector<int>, std::vector<std::vector<size_t>>>
-collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<std::vector<size_t>>& strides,
-    size_t size_cap = std::numeric_limits<int32_t>::max());

-inline std::tuple<std::vector<int>, std::vector<std::vector<size_t>>>
-collapse_contiguous_dims(
+inline std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    const std::vector<array>& xs,
    size_t size_cap = std::numeric_limits<int32_t>::max()) {
-  std::vector<std::vector<size_t>> strides;
+  std::vector<Strides> strides;
  for (auto& x : xs) {
    strides.emplace_back(x.strides());
  }
@@ -73,19 +62,14 @@ inline auto collapse_contiguous_dims(Arrays&&... xs) {
 }

 // The single array version of the above.
-std::pair<std::vector<int>, std::vector<int64_t>> collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<int64_t>& strides,
+std::pair<Shape, Strides> collapse_contiguous_dims( // (shape, strides) form
+    const Shape& shape,
+    const Strides& strides,
    int64_t size_cap = std::numeric_limits<int32_t>::max());
-std::pair<std::vector<int>, std::vector<size_t>> collapse_contiguous_dims(
-    const std::vector<int>& shape,
-    const std::vector<size_t>& strides,
-    size_t size_cap = std::numeric_limits<int32_t>::max());
-std::pair<std::vector<int>, std::vector<size_t>> collapse_contiguous_dims(
+std::pair<Shape, Strides> collapse_contiguous_dims( // array form
    const array& a,
-    size_t size_cap = std::numeric_limits<int32_t>::max());
+    int64_t size_cap = std::numeric_limits<int32_t>::max()); // cap is int64_t in both remaining overloads

-template <typename StrideT>
 struct ContiguousIterator {
  inline void step() {
    int dims = shape_.size();
@@ -102,7 +86,7 @@ struct ContiguousIterator {
    loc += strides_[i];
  }

-  void seek(StrideT n) {
+  void seek(int64_t n) {
    loc = 0;
    for (int i = shape_.size() - 1; i >= 0; --i) {
      auto q_and_r = ldiv(n, shape_[i]);
@@ -123,37 +107,34 @@ struct ContiguousIterator {
      : shape_(a.shape()), strides_(a.strides()) {
    if (!shape_.empty()) {
      std::tie(shape_, strides_) = collapse_contiguous_dims(shape_, strides_);
-      pos_ = std::vector<int>(shape_.size(), 0);
+      pos_ = Shape(shape_.size(), 0);
    }
  }

  explicit ContiguousIterator(
-      const std::vector<int>& shape,
-      const std::vector<StrideT>& strides,
+      const Shape& shape,
+      const Strides& strides,
      int dims)
      : shape_(shape.begin(), shape.begin() + dims),
        strides_(strides.begin(), strides.begin() + dims) {
    if (!shape_.empty()) {
      std::tie(shape_, strides_) = collapse_contiguous_dims(shape_, strides_);
-      pos_ = std::vector<int>(shape_.size(), 0);
+      pos_ = Shape(shape_.size(), 0);
    }
  }

-  StrideT loc{0};
+  int64_t loc{0};

 private:
-  std::vector<int> shape_;
-  std::vector<StrideT> strides_;
-  std::vector<int> pos_;
+  Shape shape_;
+  Strides strides_;
+  Shape pos_;
 };

-template <typename StrideT>
-inline auto check_contiguity(
-    const std::vector<int>& shape,
-    const std::vector<StrideT>& strides) {
+inline auto check_contiguity(const Shape& shape, const Strides& strides) {
  size_t no_broadcast_data_size = 1;
-  size_t f_stride = 1;
-  size_t b_stride = 1;
+  int64_t f_stride = 1;
+  int64_t b_stride = 1;
  bool is_row_contiguous = true;
  bool is_col_contiguous = true;

@@ -182,9 +163,15 @@ void move_or_copy(const array& in, array& out);
 void move_or_copy(
    const array& in,
    array& out,
-    const std::vector<size_t>& strides,
+    const Strides& strides,
    array::Flags flags,
    size_t data_size,
    size_t offset = 0);

+std::pair<bool, Strides> prepare_reshape(const array& in, const array& out);
+
+void shared_buffer_reshape(
+    const array& in,
+    const Strides& out_strides,
+    array& out);
 } // namespace mlx::core
--- a/mlx/backend/metal/allocator.cpp
+++ b/mlx/backend/metal/allocator.cpp
@@ -34,16 +34,20 @@ BufferCache::~BufferCache() {
  clear();
 }

-void BufferCache::clear() {
+int BufferCache::clear() { // empties the pool and returns how many buffers were released
+  int n_release = 0; // holders whose buf was non-null and got released
  for (auto& [size, holder] : buffer_pool_) {
-    if (holder->buf)
+    if (holder->buf) {
      holder->buf->release();
+      n_release++;
+    }
    delete holder;
  }
  buffer_pool_.clear();
  pool_size_ = 0;
  head_ = nullptr;
  tail_ = nullptr;
+  return n_release; // callers subtract this from their resource count
 }

 MTL::Buffer* BufferCache::reuse_from_cache(size_t size) {
@@ -81,10 +85,11 @@ void BufferCache::recycle_to_cache(MTL::Buffer* buf) {
  }
 }

-void BufferCache::release_cached_buffers(size_t min_bytes_to_free) {
+int BufferCache::release_cached_buffers(size_t min_bytes_to_free) { // returns the number of buffers released
  if (min_bytes_to_free >= 0.9 * pool_size_) {
-    clear();
+    return clear(); // request covers at least 90% of the pool: drop the whole cache
  } else {
+    int n_release = 0; // buffers released by the loop below
    size_t total_bytes_freed = 0;

    while (tail_ && (total_bytes_freed < min_bytes_to_free)) {
@@ -92,10 +97,12 @@ void BufferCache::release_cached_buffers(size_t min_bytes_to_free) {
        total_bytes_freed += tail_->buf->length();
        tail_->buf->release();
        tail_->buf = nullptr;
+        n_release++; // counted together with the release() above
      }
      remove_from_list(tail_);
    }
    pool_size_ -= total_bytes_freed;
+    return n_release;
  }
 }

@@ -144,11 +151,11 @@ MetalAllocator::MetalAllocator()
      residency_set_(device_),
      buffer_cache_(device_) {
  auto memsize = std::get<size_t>(device_info()["memory_size"]);
-  block_limit_ =
-      std::min(1.5 * device_->recommendedMaxWorkingSetSize(), 0.95 * memsize);
-  gc_limit_ = std::min(
-      static_cast<size_t>(0.95 * device_->recommendedMaxWorkingSetSize()),
-      block_limit_);
+  auto max_rec_size =
+      std::get<size_t>(device_info()["max_recommended_working_set_size"]); // read from device_info() rather than from device_ directly
+  resource_limit_ = std::get<size_t>(device_info()["resource_limit"]); // upper bound compared against num_resources_ in malloc
+  block_limit_ = std::min(1.5 * max_rec_size, 0.95 * memsize); // same formula as before, using max_rec_size
+  gc_limit_ = std::min(static_cast<size_t>(0.95 * max_rec_size), block_limit_); // never above block_limit_
  max_pool_size_ = block_limit_;
  device(mlx::core::Device::gpu)
      .set_residency_set(residency_set_.mtl_residency_set());
@@ -186,7 +193,8 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
  // More helpful message if maximum buffer length is exceeded
  if (size > device_->maxBufferLength()) {
    std::ostringstream msg;
-    msg << "Attempting to allocate " << size << " bytes which is greater than"
+    msg << "[metal::malloc] Attempting to allocate " << size
+        << " bytes which is greater than"
        << " the maximum allowed buffer size of " << device_->maxBufferLength()
        << " bytes.";
    throw std::runtime_error(msg.str());
@@ -212,16 +220,26 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {

    // If we have a lot of memory pressure or are over the maximum cache size,
    // try to reclaim memory from the cache
-    if (mem_required >= gc_limit_) {
-      buffer_cache_.release_cached_buffers(mem_required - gc_limit_);
+    if (mem_required >= gc_limit_ || num_resources_ >= resource_limit_) { // also trim the cache when the buffer count hits the limit
+      num_resources_ -=
+          buffer_cache_.release_cached_buffers(mem_required - gc_limit_); // NOTE(review): if only the resource test fired and mem_required < gc_limit_, this difference presumably wraps (unsigned) and empties the cache - confirm intended
    }

    // Allocate new buffer if needed
    size_t res_opt = MTL::ResourceStorageModeShared;
    res_opt |= MTL::ResourceHazardTrackingModeUntracked;
+    if (num_resources_ >= resource_limit_) { // still at the limit after trimming: refuse to allocate
+      std::ostringstream msg;
+      msg << "[metal::malloc] Resource limit (" << resource_limit_
+          << ") exceeded.";
+      throw std::runtime_error(msg.str());
+    }
    lk.unlock();
    buf = device_->newBuffer(size, res_opt);
    lk.lock();
+    if (buf) { // count only buffers that were actually created
+      num_resources_++;
+    }
  }

  active_memory_ += buf->length();
@@ -230,7 +248,8 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
  // Maintain the cache below the requested limit
  if (get_cache_memory() >= max_pool_size_) {
    auto pool = metal::new_scoped_memory_pool();
-    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
+    num_resources_ -= buffer_cache_.release_cached_buffers(
+        get_cache_memory() - max_pool_size_);
  }

  residency_set_.insert(buf);
@@ -241,7 +260,7 @@ Buffer MetalAllocator::malloc(size_t size, bool allow_swap /* = false */) {
 void MetalAllocator::clear_cache() {
  std::unique_lock lk(mutex_);
  auto pool = metal::new_scoped_memory_pool();
-  buffer_cache_.clear();
+  num_resources_ -= buffer_cache_.clear(); // clear() returns the number of buffers it released
 }

 void MetalAllocator::free(Buffer buffer) {
@@ -255,6 +274,7 @@ void MetalAllocator::free(Buffer buffer) {
  if (get_cache_memory() < max_pool_size_) {
    buffer_cache_.recycle_to_cache(buf);
  } else {
+    num_resources_--; // this branch releases the buffer instead of recycling it into the cache
    lk.unlock();
    auto pool = metal::new_scoped_memory_pool();
    buf->release();
--- a/mlx/backend/metal/allocator.h
+++ b/mlx/backend/metal/allocator.h
@@ -23,11 +23,11 @@ class BufferCache {

  MTL::Buffer* reuse_from_cache(size_t size);
  void recycle_to_cache(MTL::Buffer* buf);
-  void release_cached_buffers(size_t min_bytes_to_free);
+  int release_cached_buffers(size_t min_bytes_to_free); // returns the number of buffers released
  size_t cache_size() {
    return pool_size_;
  }
-  void clear();
+  int clear(); // returns the number of buffers released

 private:
  struct BufferHolder {
@@ -94,6 +94,8 @@ class MetalAllocator : public allocator::Allocator {
  size_t max_pool_size_;
  size_t wired_limit_{0};
  bool relaxed_{true};
+  size_t num_resources_{0}; // incremented per created buffer in malloc, decremented when buffers are released
+  size_t resource_limit_{0}; // set in the constructor from device_info()["resource_limit"]

  std::mutex mutex_;
 };
--- a/mlx/backend/metal/binary.cpp
+++ b/mlx/backend/metal/binary.cpp
@@ -75,19 +75,21 @@ void binary_op_gpu_inplace(
      auto [shape, strides] = collapse_contiguous_dims(a, b, out);
      return std::make_tuple(shape, strides[0], strides[1], strides[2]);
    } else {
-      std::vector<size_t> e;
-      return std::make_tuple(std::vector<int>{}, e, e, e);
+      decltype(a.strides()) e{};
+      return std::make_tuple(decltype(a.shape()){}, e, e, e);
    }
  };
  auto [shape, strides_a, strides_b, strides_out] = maybe_collapse();

-  bool large = out.data_size() > UINT32_MAX;
+  bool large; // assigned in both branches below
  auto ndim = shape.size();
  int work_per_thread;
  if (bopt == BinaryOpType::General) {
-    large |= (a.data_size() > UINT32_MAX || b.data_size() > UINT32_MAX);
+    large = a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
+        out.size() > INT32_MAX; // general path now switches at the signed 32-bit limit
    work_per_thread = large ? 4 : 2;
  } else {
+    large = out.data_size() > UINT32_MAX; // other paths keep the unsigned 32-bit threshold
    work_per_thread = 1;
  }
  std::string kernel_name =
--- a/mlx/backend/metal/compiled.cpp
+++ b/mlx/backend/metal/compiled.cpp
@@ -1,6 +1,5 @@
 // Copyright © 2023-2024 Apple Inc.
 #include <fmt/format.h>
-#include <iostream> //TODO
 #include <sstream>

 #include "mlx/backend/common/compiled.h"
@@ -67,7 +66,7 @@ inline void build_kernel(

  if (add_indices) {
    os += fmt::format(
-        "    constant const size_t* in_strides [[buffer({0})]],\n", cnt++);
+        "    constant const int64_t* in_strides [[buffer({0})]],\n", cnt++);
  }

  // Add the output arguments
@@ -81,7 +80,7 @@ inline void build_kernel(
  // Add output strides and shape to extract the indices.
  if (!contiguous) {
    os += fmt::format(
-        "    constant const size_t* output_strides [[buffer({0})]],\n", cnt++);
+        "    constant const int64_t* output_strides [[buffer({0})]],\n", cnt++);
    os += fmt::format(
        "    constant const int* output_shape [[buffer({0})]],\n", cnt++);
  }
@@ -93,11 +92,11 @@ inline void build_kernel(
  os += "    uint3 pos [[thread_position_in_grid]],\n";
  os += "    uint3 grid [[threads_per_grid]]) {\n";

-  std::string idx_type = use_big_index ? "size_t" : "uint";
+  std::string idx_type = use_big_index ? "int64_t" : "uint";
  if (contiguous && use_big_index) {
    // This is only used for contiguous kernels which don't have
    // a third grid dimension
-    os += "  size_t index = pos.x + grid.x * size_t(pos.y);\n";
+    os += "  int64_t index = pos.x + grid.x * int64_t(pos.y);\n";
  } else if (work_per_thread > 1) {
    os += fmt::format("  constexpr int N_ = {0};\n", work_per_thread);
    os += fmt::format(
@@ -144,20 +143,18 @@ inline void build_kernel(
    os += fmt::format("  {0} index_{1} = ", idx_type, xname);
    if (ndim == 1) {
      int offset = i * ndim;
-      os += fmt::format(
-          "elem_to_loc_1<size_t, uint>(pos.x, in_strides[{0}]);\n", offset);
+      os +=
+          fmt::format("elem_to_loc_1<uint>(pos.x, in_strides[{0}]);\n", offset);
    } else if (ndim == 2) {
      int offset = i * ndim;
      os += fmt::format(
-          "elem_to_loc_2<size_t, {0}>({{pos.x, pos.y}}, in_strides + {1});\n",
+          "elem_to_loc_2<{0}>({{pos.x, pos.y}}, in_strides + {1});\n",
          idx_type,
          offset);
    } else if (ndim == 3) {
      int offset = i * ndim;
      os += fmt::format(
-          "elem_to_loc_3<size_t, {0}>(pos, in_strides + {1});\n",
-          idx_type,
-          offset);
+          "elem_to_loc_3<{0}>(pos, in_strides + {1});\n", idx_type, offset);
    } else if (!dynamic_dims) {
      int offset = (i + 1) * ndim;
      os += fmt::format(
@@ -360,10 +357,10 @@ void Compiled::eval_gpu(

  // Collapse contiguous dims to route to a faster kernel if possible. Also
  // handle all broadcasting.
-  std::vector<std::vector<size_t>> initial_strides;
+  std::vector<Strides> initial_strides;
  initial_strides.push_back(outputs[0].strides());
-  std::vector<int> shape;
-  std::vector<std::vector<size_t>> strides;
+  Shape shape;
+  std::vector<Strides> strides;
  if (!contiguous) {
    for (int i = 0; i < inputs.size(); i++) {
      // Skip constants.
@@ -378,7 +375,7 @@ void Compiled::eval_gpu(
      }

      // Broadcast the inputs to the output shape.
-      std::vector<size_t> xstrides;
+      Strides xstrides;
      int j = 0;
      for (; j < output_shape.size() - x.ndim(); j++) {
        if (output_shape[j] == 1) {
@@ -440,7 +437,7 @@ void Compiled::eval_gpu(
  // Put the inputs in
  int cnt = 0;
  int stride_idx = 1; // idx 0 is the output strides
-  std::vector<size_t> in_strides;
+  Strides in_strides;
  for (int i = 0; i < inputs.size(); i++) {
    if (constant_ids_.find(inputs_[i].id()) != constant_ids_.end()) {
      continue;
--- a/mlx/backend/metal/conv.cpp
+++ b/mlx/backend/metal/conv.cpp
@@ -34,7 +34,7 @@ void explicit_gemm_conv_ND_gpu(
  int implicit_K = wt.size() / conv_params.O;
  int implicit_N = conv_params.O;
  // Prepare unfolding array
-  std::vector<int> unfolded_shape{implicit_M, implicit_K};
+  Shape unfolded_shape{implicit_M, implicit_K};
  array in_unfolded(unfolded_shape, in.dtype(), nullptr, {});

  in_unfolded.set_data(allocator::malloc_or_wait(in_unfolded.nbytes()));
@@ -64,8 +64,8 @@ void explicit_gemm_conv_ND_gpu(
  compute_encoder.dispatch_threads(grid_dims, group_dims);

  // Reshape weight
-  std::vector<int> wt_reshape{implicit_K, implicit_N};
-  std::vector<size_t> wt_restride{1, static_cast<size_t>(implicit_K)};
+  Shape wt_reshape{implicit_K, implicit_N};
+  Strides wt_restride{1, implicit_K};
  array wt_reshaped(wt_reshape, wt.dtype(), nullptr, {});
  auto wt_flags = wt.flags();
  wt_flags.row_contiguous = false;
@@ -113,7 +113,7 @@ void explicit_gemm_conv_group_ND_gpu(
  }

  // Prepare unfolding array
-  std::vector<int> unfolded_shape{implicit_M, implicit_K * groups};
+  Shape unfolded_shape{implicit_M, implicit_K * groups};
  array in_unfolded(unfolded_shape, in.dtype(), nullptr, {});
  in_unfolded.set_data(allocator::malloc_or_wait(in_unfolded.nbytes()));

@@ -147,10 +147,7 @@ void explicit_gemm_conv_group_ND_gpu(
  array wt_view(
      {wt.shape(0), C_per_group, kernel_size}, wt.dtype(), nullptr, {});
  wt_view.copy_shared_buffer(
-      wt,
-      {wt.strides(0), 1, static_cast<size_t>(C_per_group)},
-      wt.flags(),
-      wt.size());
+      wt, {wt.strides(0), 1, C_per_group}, wt.flags(), wt.size());

  // Materialize
  auto wt_transpose = array(wt_view.shape(), wt_view.dtype(), nullptr, {});
@@ -195,12 +192,12 @@ void conv_1D_gpu(
    bool flip) {
  // Make conv params
  MLXConvParams<1> conv_params{
-      /* const int  N = */ in.shape(0),
-      /* const int  C = */ in.shape(2),
-      /* const int  O = */ wt.shape(0),
-      /* const int iS[NDIM] = */ {in.shape(1)},
-      /* const int wS[NDIM] = */ {wt.shape(1)},
-      /* const int oS[NDIM] = */ {out.shape(1)},
+      /* const int  N = */ static_cast<int>(in.shape(0)),
+      /* const int  C = */ static_cast<int>(in.shape(2)),
+      /* const int  O = */ static_cast<int>(wt.shape(0)),
+      /* const int iS[NDIM] = */ {static_cast<int>(in.shape(1))},
+      /* const int wS[NDIM] = */ {static_cast<int>(wt.shape(1))},
+      /* const int oS[NDIM] = */ {static_cast<int>(out.shape(1))},
      /* const int str[NDIM] = */ {wt_strides[0]},
      /* const int pad[NDIM] = */ {padding[0]},
      /* const int kdil[NDIM] = */ {wt_dilation[0]},
@@ -544,7 +541,7 @@ void winograd_conv_2D_gpu(
    array out,
    const MLXConvParams<2>& conv_params,
    std::vector<array>& copies_w) {
-  std::vector<int> padded_shape = {
+  Shape padded_shape = {
      conv_params.N,
      conv_params.iS[0] + 2 * conv_params.pad[0],
      conv_params.iS[1] + 2 * conv_params.pad[1],
@@ -553,7 +550,7 @@ void winograd_conv_2D_gpu(
  padded_shape[1] = 6 * ((padded_shape[1] - 2 + 5) / 6) + 2;
  padded_shape[2] = 6 * ((padded_shape[2] - 2 + 5) / 6) + 2;

-  array in_padded(padded_shape, in.dtype(), nullptr, {});
+  array in_padded(std::move(padded_shape), in.dtype(), nullptr, {});

  // Fill with zeros
  array zero_arr = array(0, in.dtype());
@@ -578,12 +575,16 @@ void winograd_conv_2D_gpu(
  copies_w.push_back(in_padded);

  MLXConvParams<2> conv_params_updated{
-      /* const int  N = */ in_padded.shape(0),
-      /* const int  C = */ in_padded.shape(3),
-      /* const int  O = */ wt.shape(0),
-      /* const int iS[NDIM] = */ {in_padded.shape(1), in_padded.shape(2)},
-      /* const int wS[NDIM] = */ {wt.shape(1), wt.shape(2)},
-      /* const int oS[NDIM] = */ {out.shape(1), out.shape(2)},
+      /* const int  N = */ static_cast<int>(in_padded.shape(0)),
+      /* const int  C = */ static_cast<int>(in_padded.shape(3)),
+      /* const int  O = */ static_cast<int>(wt.shape(0)),
+      /* const int iS[NDIM] = */
+      {static_cast<int>(in_padded.shape(1)),
+       static_cast<int>(in_padded.shape(2))},
+      /* const int wS[NDIM] = */
+      {static_cast<int>(wt.shape(1)), static_cast<int>(wt.shape(2))},
+      /* const int oS[NDIM] = */
+      {static_cast<int>(out.shape(1)), static_cast<int>(out.shape(2))},
      /* const int str[NDIM] = */ {1, 1},
      /* const int pad[NDIM] = */ {0, 0},
      /* const int kdil[NDIM] = */ {1, 1},
@@ -610,8 +611,8 @@ void winograd_conv_2D_gpu(
  int N_tiles = N_tiles_n * N_tiles_h * N_tiles_w;

  // Do filter transform
-  std::vector<int> filt_wg_shape = {8 * 8, conv_params.C, conv_params.O};
-  array filt_wg(filt_wg_shape, wt.dtype(), nullptr, {});
+  Shape filt_wg_shape = {8 * 8, conv_params.C, conv_params.O};
+  array filt_wg(std::move(filt_wg_shape), wt.dtype(), nullptr, {});
  filt_wg.set_data(allocator::malloc_or_wait(filt_wg.nbytes()));
  copies_w.push_back(filt_wg);
  {
@@ -637,8 +638,8 @@ void winograd_conv_2D_gpu(
  }

  // Do input transform
-  std::vector<int> inp_wg_shape = {8 * 8, N_tiles, conv_params.C};
-  array inp_wg(inp_wg_shape, in.dtype(), nullptr, {});
+  Shape inp_wg_shape = {8 * 8, N_tiles, conv_params.C};
+  array inp_wg(std::move(inp_wg_shape), in.dtype(), nullptr, {});
  inp_wg.set_data(allocator::malloc_or_wait(inp_wg.nbytes()));
  copies_w.push_back(inp_wg);
  {
@@ -664,8 +665,8 @@ void winograd_conv_2D_gpu(
  }

  // Do batched gemm
-  std::vector<int> out_wg_shape = {8 * 8, N_tiles, conv_params.O};
-  array out_wg(out_wg_shape, in.dtype(), nullptr, {});
+  Shape out_wg_shape = {8 * 8, N_tiles, conv_params.O};
+  array out_wg(std::move(out_wg_shape), in.dtype(), nullptr, {});
  out_wg.set_data(allocator::malloc_or_wait(out_wg.nbytes()));
  copies_w.push_back(out_wg);
  {
@@ -726,12 +727,15 @@ void conv_2D_gpu(
    std::vector<array>& copies) {
  // Make conv params
  MLXConvParams<2> conv_params{
-      /* const int  N = */ in.shape(0),
-      /* const int  C = */ in.shape(3),
-      /* const int  O = */ wt.shape(0),
-      /* const int iS[NDIM] = */ {in.shape(1), in.shape(2)},
-      /* const int wS[NDIM] = */ {wt.shape(1), wt.shape(2)},
-      /* const int oS[NDIM] = */ {out.shape(1), out.shape(2)},
+      /* const int  N = */ static_cast<int>(in.shape(0)),
+      /* const int  C = */ static_cast<int>(in.shape(3)),
+      /* const int  O = */ static_cast<int>(wt.shape(0)),
+      /* const int iS[NDIM] = */
+      {static_cast<int>(in.shape(1)), static_cast<int>(in.shape(2))},
+      /* const int wS[NDIM] = */
+      {static_cast<int>(wt.shape(1)), static_cast<int>(wt.shape(2))},
+      /* const int oS[NDIM] = */
+      {static_cast<int>(out.shape(1)), static_cast<int>(out.shape(2))},
      /* const int str[NDIM] = */ {wt_strides[0], wt_strides[1]},
      /* const int pad[NDIM] = */ {padding[0], padding[1]},
      /* const int kdil[NDIM] = */ {wt_dilation[0], wt_dilation[1]},
@@ -803,12 +807,21 @@ void conv_3D_gpu(
    std::vector<array>& copies) {
  // Make conv params
  MLXConvParams<3> conv_params{
-      /* const int  N = */ in.shape(0),
-      /* const int  C = */ in.shape(4),
-      /* const int  O = */ wt.shape(0),
-      /* const int iS[NDIM] = */ {in.shape(1), in.shape(2), in.shape(3)},
-      /* const int wS[NDIM] = */ {wt.shape(1), wt.shape(2), wt.shape(3)},
-      /* const int oS[NDIM] = */ {out.shape(1), out.shape(2), out.shape(3)},
+      /* const int  N = */ static_cast<int>(in.shape(0)),
+      /* const int  C = */ static_cast<int>(in.shape(4)),
+      /* const int  O = */ static_cast<int>(wt.shape(0)),
+      /* const int iS[NDIM] = */
+      {static_cast<int>(in.shape(1)),
+       static_cast<int>(in.shape(2)),
+       static_cast<int>(in.shape(3))},
+      /* const int wS[NDIM] = */
+      {static_cast<int>(wt.shape(1)),
+       static_cast<int>(wt.shape(2)),
+       static_cast<int>(wt.shape(3))},
+      /* const int oS[NDIM] = */
+      {static_cast<int>(out.shape(1)),
+       static_cast<int>(out.shape(2)),
+       static_cast<int>(out.shape(3))},
      /* const int str[NDIM] = */ {wt_strides[0], wt_strides[1], wt_strides[2]},
      /* const int pad[NDIM] = */ {padding[0], padding[1], padding[2]},
      /* const int kdil[NDIM] = */
--- a/mlx/backend/metal/copy.cpp
+++ b/mlx/backend/metal/copy.cpp
@@ -43,17 +43,18 @@ void copy_gpu(const array& in, array& out, CopyType ctype) {
  copy_gpu(in, out, ctype, out.primitive().stream());
 }

-template <typename stride_t>
 void copy_gpu_inplace(
    const array& in,
    array& out,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& strides_in_pre,
-    const std::vector<stride_t>& strides_out_pre,
+    const Shape& data_shape,
+    const Strides& strides_in_pre,
+    const Strides& strides_out_pre,
    int64_t inp_offset,
    int64_t out_offset,
    CopyType ctype,
-    const Stream& s) {
+    const Stream& s,
+    const std::optional<array>& dynamic_i_offset /* = std::nullopt */,
+    const std::optional<array>& dynamic_o_offset /* = std::nullopt */) {
  if (out.size() == 0) {
    return;
  }
@@ -68,8 +69,8 @@ void copy_gpu_inplace(
              /* size_cap = */ INT32_MAX);
          return std::make_tuple(shape, strides[0], strides[1]);
        } else {
-          std::vector<stride_t> e;
-          return std::make_tuple(std::vector<int>{}, e, e);
+          Strides e{};
+          return std::make_tuple(Shape{}, e, e);
        }
      };
  auto [shape, strides_in_, strides_out_] = maybe_collapse();
@@ -81,6 +82,7 @@ void copy_gpu_inplace(
  } else {
    large = out.data_size() > UINT32_MAX;
  }
+  bool dynamic = dynamic_i_offset || dynamic_o_offset;
  auto& d = metal::device(s.device);
  int work_per_thread = 1;
  std::string kernel_name;
@@ -108,9 +110,17 @@ void copy_gpu_inplace(
    if (large) {
      kernel_name += "large";
    }
+    if (dynamic) { // true when either dynamic offset array was supplied
+      kernel_name += "_dynamic";
+      if (ctype != CopyType::GeneralGeneral) { // NOTE(review): also fires for a dynamic input offset, though the message names only the output offset
+        throw std::runtime_error(
+            "[Copy::eval_gpu] Dynamic output offset requires GeneralGeneral copy");
+      }
+    }
  }
  concatenate(kernel_name, "_copy", type_to_name(in), type_to_name(out));
-  auto kernel = get_copy_kernel(d, kernel_name, in, out);
+  auto kernel = dynamic ? get_dynamic_copy_kernel(d, kernel_name, in, out)
+                        : get_copy_kernel(d, kernel_name, in, out);

  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);
@@ -124,8 +134,8 @@ void copy_gpu_inplace(

  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
-    std::vector<int64_t> strides_in{strides_in_.begin(), strides_in_.end()};
-    std::vector<int64_t> strides_out{strides_out_.begin(), strides_out_.end()};
+    Strides strides_in{strides_in_.begin(), strides_in_.end()};
+    Strides strides_out{strides_out_.begin(), strides_out_.end()};
    if (ndim > 3) {
      compute_encoder.set_vector_bytes(shape, ndim, 2);
    }
@@ -146,6 +156,18 @@ void copy_gpu_inplace(
      compute_encoder.set_bytes(ndim, 5);
      dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
    }
+    if (dynamic) { // bind both offset slots whenever the dynamic kernel is used
+      if (dynamic_i_offset) {
+        compute_encoder.set_input_array(*dynamic_i_offset, 6); // slot 6: input offset array
+      } else {
+        compute_encoder.set_bytes(0ll, 6); // no input offset given: pass a literal 0
+      }
+      if (dynamic_o_offset) {
+        compute_encoder.set_input_array(*dynamic_o_offset, 7); // slot 7: output offset array
+      } else {
+        compute_encoder.set_bytes(0ll, 7); // no output offset given: pass a literal 0
+      }
+    }

    // NB assuming thread_group_size is a power of 2 larger than 32 x 32
    if (thread_group_size != 1024) {
@@ -180,14 +202,13 @@ void copy_gpu_inplace(
 void copy_gpu_inplace(
    const array& in,
    array& out,
-    const std::vector<int64_t>& istride,
-    int64_t ioffset,
+    const Strides& i_strides, // forwarded as the input strides
+    int64_t i_offset, // forwarded as the input offset
    CopyType ctype,
    const Stream& s) {
  assert(in.shape() == out.shape());
-  std::vector<int64_t> ostrides{out.strides().begin(), out.strides().end()};
  return copy_gpu_inplace(
-      in, out, in.shape(), istride, ostrides, ioffset, 0, ctype, s);
+      in, out, in.shape(), i_strides, out.strides(), i_offset, 0, ctype, s); // out.strides() passed directly, output offset 0
 }

 void fill_gpu(const array& val, array& out, const Stream& s) {
--- a/mlx/backend/metal/copy.h
+++ b/mlx/backend/metal/copy.h
@@ -8,23 +8,24 @@
 namespace mlx::core {

 // Generic copy inplace
-template <typename stride_t>
 void copy_gpu_inplace(
    const array& in,
    array& out,
-    const std::vector<int>& data_shape,
-    const std::vector<stride_t>& i_strides,
-    const std::vector<stride_t>& o_strides,
+    const Shape& data_shape,
+    const Strides& i_strides,
+    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    CopyType ctype,
-    const Stream& s);
+    const Stream& s,
+    const std::optional<array>& dynamic_i_offset = std::nullopt, // optional array-valued input offset; defaults to none
+    const std::optional<array>& dynamic_o_offset = std::nullopt); // optional array-valued output offset; defaults to none

 void copy_gpu(const array& src, array& out, CopyType ctype, const Stream& s);
 void copy_gpu(const array& src, array& out, CopyType ctype);

 void copy_gpu_inplace(
-    const array& src,
+    const array& in,
    array& out,
    CopyType ctype,
    const Stream& s);
@@ -32,8 +33,8 @@ void copy_gpu_inplace(
 void copy_gpu_inplace(
    const array& in,
    array& out,
-    const std::vector<int64_t>& istride,
-    int64_t ioffset,
+    const Strides& i_strides,
+    int64_t i_offset,
    CopyType ctype,
    const Stream& s);

--- a/mlx/backend/metal/device.cpp
+++ b/mlx/backend/metal/device.cpp
@@ -651,18 +651,23 @@ device_info() {
    auto raw_device = device(default_device()).mtl_device();
    auto arch = std::string(raw_device->architecture()->name()->utf8String());

-    int mib[] = {CTL_HW, HW_MEMSIZE};
    size_t memsize = 0;
    size_t length = sizeof(memsize);
+    sysctlbyname("hw.memsize", &memsize, &length, NULL, 0); // by-name query replaces the CTL_HW/HW_MEMSIZE mib lookup

-    sysctl(mib, 2, &memsize, &length, NULL, 0);
+    size_t rsrc_limit = 0;
+    sysctlbyname("iogpu.rsrc_limit", &rsrc_limit, &length, NULL, 0); // NOTE(review): reuses length as left by the previous call and ignores the return value - confirm
+    if (rsrc_limit == 0) { // query left the value at 0
+      rsrc_limit = 499000; // fallback limit
+    }

    return {
        {"architecture", arch},
        {"max_buffer_length", raw_device->maxBufferLength()},
        {"max_recommended_working_set_size",
         raw_device->recommendedMaxWorkingSetSize()},
-        {"memory_size", memsize}};
+        {"memory_size", memsize},
+        {"resource_limit", rsrc_limit}}; // new key read by the MetalAllocator constructor
  };
  static auto device_info_ = init_device_info();
  return device_info_;
--- a/mlx/backend/metal/distributed.cpp
+++ b/mlx/backend/metal/distributed.cpp
@@ -3,6 +3,7 @@
 #include <cassert>

 #include "mlx/allocator.h"
+#include "mlx/backend/common/utils.h"
 #include "mlx/backend/metal/device.h"
 #include "mlx/distributed/ops.h"
 #include "mlx/distributed/primitives.h"
@@ -89,13 +90,14 @@ void Send::eval_gpu(

  auto& in = inputs[0];
  auto& out = outputs[0];
+  move_or_copy(in, out); // presumably gives out the input's data before the async send - confirm in move_or_copy

  // Schedule an async send on the comm stream
  auto task = [in = in, out = out, group = group(), dst = dst_]() mutable {
    if (in.event().valid()) {
      in.event().wait();
    }
-    distributed::detail::send(group, in, dst);
+    distributed::detail::send(group, out, dst); // the task now sends out instead of in
    out.event().signal();
  };
  scheduler::enqueue(detail::communication_stream(), std::move(task));
@@ -133,6 +135,7 @@ void Recv::eval_gpu(
  // Encode a wait event as there is no input for the recv to encode a signal.
  auto& s = stream();
  auto& d = metal::device(s.device);
+  d.end_encoding(s.index);
  auto command_buffer = d.get_command_buffer(s.index);
  command_buffer->encodeWait(
      static_cast<MTL::Event*>(out.event().raw_event().get()),
--- a/mlx/backend/metal/fft.cpp
+++ b/mlx/backend/metal/fft.cpp
@@ -363,7 +363,7 @@ void multi_upload_bluestein_fft(
  auto [w_k, w_q] = compute_bluestein_constants(n, plan.bluestein_n);

  // Broadcast w_q and w_k to the batch size
-  std::vector<size_t> b_strides(in.ndim(), 0);
+  Strides b_strides(in.ndim(), 0);
  b_strides[axis] = 1;
  array w_k_broadcast({}, complex64, nullptr, {});
  array w_q_broadcast({}, complex64, nullptr, {});
@@ -386,8 +386,8 @@ void multi_upload_bluestein_fft(
    copies.push_back(slice_temp);
    copies.push_back(conj_temp);

-    std::vector<int> rstarts(in.ndim(), 0);
-    std::vector<int> rstrides(in.ndim(), 1);
+    Shape rstarts(in.ndim(), 0);
+    Shape rstrides(in.ndim(), 1);
    rstarts[axis] = in.shape(axis) - back_offset;
    rstrides[axis] = -1;
    unary_op_gpu({in}, conj_temp, "Conjugate", s);
@@ -431,19 +431,19 @@ void multi_upload_bluestein_fft(
      s);

  int offset = plan.bluestein_n - (2 * n - 1);
-  std::vector<int> starts(in.ndim(), 0);
-  std::vector<int> strides(in.ndim(), 1);
+  Shape starts(in.ndim(), 0);
+  Shape strides(in.ndim(), 1);
  starts[axis] = plan.bluestein_n - offset - n;
  slice_gpu(pad_temp1, temp, starts, strides, s);

  binary_op_gpu_inplace({temp, w_k_broadcast}, temp1, "Multiply", s);

  if (real && !inverse) {
-    std::vector<int> rstarts(in.ndim(), 0);
-    std::vector<int> rstrides(in.ndim(), 1);
+    Shape rstarts(in.ndim(), 0);
+    Shape rstrides(in.ndim(), 1);
    slice_gpu(temp1, out, rstarts, strides, s);
  } else if (real && inverse) {
-    std::vector<size_t> b_strides(in.ndim(), 0);
+    Strides b_strides(in.ndim(), 0);
    auto inv_n = array({1.0f / n}, {1}, float32);
    array temp_float(out.shape(), out.dtype(), nullptr, {});
    copies.push_back(temp_float);
@@ -531,8 +531,8 @@ void fft_op(
      return x;
    } else {
      array x_copy(x.shape(), x.dtype(), nullptr, {});
-      std::vector<size_t> strides;
-      size_t cur_stride = x.shape(axis);
+      Strides strides;
+      int64_t cur_stride = x.shape(axis);
      for (int a = 0; a < x.ndim(); a++) {
        if (a == axis) {
          strides.push_back(1);
@@ -777,7 +777,7 @@ void nd_fft_op(
    // Mirror np.fft.(i)rfftn and perform a real transform
    // only on the final axis.
    bool step_real = (real && index == axes.size() - 1);
-    int step_shape = inverse ? out.shape(axis) : in.shape(axis);
+    auto step_shape = inverse ? out.shape(axis) : in.shape(axis);
    const array& in_arr = i == axes.size() - 1 ? in : temp_arrs[1 - i % 2];
    array& out_arr = i == 0 ? out : temp_arrs[i % 2];
    fft_op(in_arr, out_arr, axis, inverse, step_real, inplace, s);
--- a/mlx/backend/metal/indexing.cpp
+++ b/mlx/backend/metal/indexing.cpp
@@ -53,9 +53,9 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
  int idx_ndim = nidx ? inputs[1].ndim() : 0;
  size_t ndim = src.ndim();

-  bool large_index = nidx && inputs[1].size() > UINT32_MAX;
-  bool large_src = src.size() > UINT32_MAX;
-  bool large_out = out.size() > UINT32_MAX;
+  bool large_index = nidx && inputs[1].size() > INT32_MAX;
+  bool large_src = src.size() > INT32_MAX;
+  bool large_out = out.size() > INT32_MAX;
  bool large = large_index || large_src || large_out;

  std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";
@@ -65,7 +65,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
      idx_type_name,
      nidx,
      idx_ndim,
-      large ? "size_t" : "uint");
+      large ? "int64_t" : "int");
  std::string lib_name = kernel_name;

  auto lib = d.get_library(lib_name, [&]() {
@@ -86,7 +86,7 @@ void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
        idx_args,
        idx_arr,
        idx_ndim,
-        large ? "size_t" : "uint");
+        large ? "int64_t" : "int");
    return kernel_source;
  });

@@ -234,9 +234,9 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
      break;
  }
  auto upd_contig = upd.flags().row_contiguous;
-  bool large_out = out.size() > UINT32_MAX;
-  bool large_idx = nidx && (inputs[1].size() > UINT32_MAX);
-  bool large_upd = upd.size() > UINT32_MAX;
+  bool large_out = out.size() > INT32_MAX;
+  bool large_idx = nidx && (inputs[1].size() > INT32_MAX);
+  bool large_upd = upd.size() > INT32_MAX;
  bool large = large_out || large_idx || large_upd;
  std::string kernel_name = fmt::format(
      "scatter{0}{1}_{2}_{3}_{4}_nwork{5}_{6}",
@@ -246,7 +246,7 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
      nidx,
      upd_contig ? "updc_true" : "updc_false",
      nwork,
-      large ? "size_t" : "uint");
+      large ? "int64_t" : "int");
  std::string lib_name = kernel_name;

  auto lib = d.get_library(lib_name, [&]() {
@@ -290,7 +290,7 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
        idx_arr,
        upd_contig,
        nwork,
-        large ? "size_t" : "uint");
+        large ? "int64_t" : "int");
    return kernel_source;
  });

@@ -312,8 +312,8 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
    upd_size *= upd.shape(i);
  }
  // Collect all idx shapes and strides into one place
-  std::vector<int> idx_shapes;
-  std::vector<size_t> idx_strides;
+  Shape idx_shapes;
+  Strides idx_strides;
  // To access .data() use char instead of bool
  // bool is 1 byte in Metal so this is safe
  std::vector<char> idx_contigs;
@@ -332,7 +332,7 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  if (upd_ndim == 0) {
    // Need placeholders so Metal doesn't compalain
    int shape_ = 0;
-    size_t stride_ = 0;
+    int64_t stride_ = 0;
    compute_encoder.set_bytes(shape_, 3);
    compute_encoder.set_bytes(stride_, 4);
  } else {
@@ -347,7 +347,7 @@ void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  if (out_ndim == 0) {
    // Need placeholders so Metal doesn't compalain
    int shape_ = 0;
-    size_t stride_ = 0;
+    int64_t stride_ = 0;
    compute_encoder.set_bytes(shape_, 7);
    compute_encoder.set_bytes(stride_, 8);
  } else {
--- a/mlx/backend/metal/jit/gemv_masked.h
+++ b/mlx/backend/metal/jit/gemv_masked.h
@@ -11,13 +11,13 @@ gemv_{trans}masked<{itype}, {outm_t}, {opm_t}, {bm}, {bn}, {sm}, {sn}, {tm}, {tn
    const constant int& marix_ld [[buffer(6)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
-    const constant size_t* vector_batch_stride [[buffer(11)]],
-    const constant size_t* matrix_batch_stride [[buffer(12)]],
+    const constant int64_t* vector_batch_stride [[buffer(11)]],
+    const constant int64_t* matrix_batch_stride [[buffer(12)]],
    const device {outm_t}* out_mask [[buffer(20)]],
    const device {opm_t}* mat_mask [[buffer(21)]],
    const device {opm_t}* vec_mask [[buffer(22)]],
    const constant int* mask_strides [[buffer(23)]],
-    const constant size_t* mask_batch_strides [[buffer(24)]],
+    const constant int64_t* mask_batch_strides [[buffer(24)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
--- a/mlx/backend/metal/jit/indexing.h
+++ b/mlx/backend/metal/jit/indexing.h
@@ -5,12 +5,12 @@ constexpr std::string_view gather_kernels = R"(
    const device {1}* src [[buffer(0)]],
    device {1}* out [[buffer(1)]],
    const constant int* src_shape [[buffer(2)]],
-    const constant size_t* src_strides [[buffer(3)]],
+    const constant int64_t* src_strides [[buffer(3)]],
    const constant size_t& src_ndim [[buffer(4)]],
    const constant int* slice_sizes [[buffer(5)]],
    const constant int* axes [[buffer(6)]],
    const constant int* idx_shapes [[buffer(7)]],
-    const constant size_t* idx_strides [[buffer(8)]],
+    const constant int64_t* idx_strides [[buffer(8)]],
    const constant bool* idx_contigs [[buffer(9)]],
    const constant int& idx_ndim [[buffer(10)]],
    {4}
@@ -38,15 +38,15 @@ constexpr std::string_view scatter_kernels = R"(
    const device {1}* updates [[buffer(1)]],
    device mlx_atomic<{1}>* out [[buffer(2)]],
    const constant int* upd_shape [[buffer(3)]],
-    const constant size_t* upd_strides [[buffer(4)]],
+    const constant int64_t* upd_strides [[buffer(4)]],
    const constant size_t& upd_ndim [[buffer(5)]],
    const constant size_t& upd_size [[buffer(6)]],
    const constant int* out_shape [[buffer(7)]],
-    const constant size_t* out_strides [[buffer(8)]],
+    const constant int64_t* out_strides [[buffer(8)]],
    const constant size_t& out_ndim [[buffer(9)]],
    const constant int* axes [[buffer(10)]],
    const constant int* idx_shapes [[buffer(11)]],
-    const constant size_t* idx_strides [[buffer(12)]],
+    const constant int64_t* idx_strides [[buffer(12)]],
    const constant bool* idx_contigs [[buffer(13)]],
    const constant int& idx_ndim [[buffer(14)]],
    const constant size_t& idx_size [[buffer(15)]],
--- a/mlx/backend/metal/jit/steel_gemm.h
+++ b/mlx/backend/metal/jit/steel_gemm.h
@@ -10,12 +10,12 @@ template [[host_name("{name}")]]
    const constant GEMMParams* params [[buffer(4)]],
    const constant GEMMAddMMParams* addmm_params [[buffer(5), function_constant(use_out_source)]],
    const constant int* batch_shape [[buffer(6)]],
-    const constant size_t* batch_strides [[buffer(7)]],
+    const constant int64_t* batch_strides [[buffer(7)]],
    const constant uint32_t* lhs_indices [[buffer(10), function_constant(do_gather)]],
    const constant uint32_t* rhs_indices [[buffer(11), function_constant(do_gather)]],
    const constant uint32_t* C_indices [[buffer(12), function_constant(gather_bias)]],
    const constant int* operand_shape [[buffer(13), function_constant(do_gather)]],
-    const constant size_t* operand_strides [[buffer(14), function_constant(do_gather)]],
+    const constant int64_t* operand_strides [[buffer(14), function_constant(do_gather)]],
    const constant packed_int3& operand_batch_ndim [[buffer(15), function_constant(do_gather)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
@@ -43,7 +43,7 @@ block_masked_gemm<
    device {itype}* D [[buffer(3)]],
    const constant GEMMParams* params [[buffer(4)]],
    const constant int* batch_shape [[buffer(6)]],
-    const constant size_t* batch_strides [[buffer(7)]],
+    const constant int64_t* batch_strides [[buffer(7)]],
    const device {outmasktype}* out_mask [[buffer(10)]],
    const device {opmasktype}* lhs_mask [[buffer(11)]],
    const device {opmasktype}* rhs_mask [[buffer(12)]],
--- a/mlx/backend/metal/jit_kernels.cpp
+++ b/mlx/backend/metal/jit_kernels.cpp
@@ -52,7 +52,7 @@ MTL::ComputePipelineState* get_unary_kernel(
    kernel_source +=
        get_template_definition("v2_" + lib_name, "unary_v2", in_t, out_t, op);
    kernel_source += get_template_definition(
-        "gn1_" + lib_name, "unary_g", in_t, out_t, op, 1, "uint");
+        "gn1_" + lib_name, "unary_g", in_t, out_t, op, 1, "int");
    kernel_source += get_template_definition(
        "gn4large_" + lib_name, "unary_g", in_t, out_t, op, 4);
    return kernel_source;
@@ -74,7 +74,7 @@ void append_binary_kernels(
      {"vs2", "binary_vs2"},
      {"sv2", "binary_sv2"},
      {"vv2", "binary_vv2"},
-      {"g1", "binary_g_nd1"},
+      {"g1large", "binary_g_nd1"},
      {"g2large", "binary_g_nd2"},
      {"g3large", "binary_g_nd3"},
  }};
@@ -86,11 +86,13 @@ void append_binary_kernels(
        get_template_definition(name + "_" + lib_name, func, in_t, out_t, op);
  }
  kernel_source += get_template_definition(
-      "g2_" + lib_name, "binary_g_nd2", in_t, out_t, op, "uint");
+      "g1_" + lib_name, "binary_g_nd1", in_t, out_t, op, "int");
  kernel_source += get_template_definition(
-      "g3_" + lib_name, "binary_g_nd3", in_t, out_t, op, "uint");
+      "g2_" + lib_name, "binary_g_nd2", in_t, out_t, op, "int");
  kernel_source += get_template_definition(
-      "gn2_" + lib_name, "binary_g", in_t, out_t, op, 2, "uint");
+      "g3_" + lib_name, "binary_g_nd3", in_t, out_t, op, "int");
+  kernel_source += get_template_definition(
+      "gn2_" + lib_name, "binary_g", in_t, out_t, op, 2, "int");
  kernel_source += get_template_definition(
      "gn4large_" + lib_name, "binary_g", in_t, out_t, op, 4);
 }
@@ -141,7 +143,7 @@ MTL::ComputePipelineState* get_ternary_kernel(
    const std::array<std::pair<std::string, std::string>, 5> kernel_types = {{
        {"v", "ternary_v"},
        {"v2", "ternary_v2"},
-        {"g1", "ternary_g_nd1"},
+        {"g1large", "ternary_g_nd1"},
        {"g2large", "ternary_g_nd2"},
        {"g3large", "ternary_g_nd3"},
    }};
@@ -150,11 +152,13 @@ MTL::ComputePipelineState* get_ternary_kernel(
          get_template_definition(name + "_" + lib_name, func, t_str, op);
    }
    kernel_source += get_template_definition(
-        "g2_" + lib_name, "ternary_g_nd2", t_str, op, "uint");
+        "g1_" + lib_name, "ternary_g_nd1", t_str, op, "int");
    kernel_source += get_template_definition(
-        "g3_" + lib_name, "ternary_g_nd3", t_str, op, "uint");
+        "g2_" + lib_name, "ternary_g_nd2", t_str, op, "int");
    kernel_source += get_template_definition(
-        "gn2_" + lib_name, "ternary_g", t_str, op, 2, "uint");
+        "g3_" + lib_name, "ternary_g_nd3", t_str, op, "int");
+    kernel_source += get_template_definition(
+        "gn2_" + lib_name, "ternary_g", t_str, op, 2, "int");
    kernel_source += get_template_definition(
        "gn4large_" + lib_name, "ternary_g", t_str, op, 4);
    return kernel_source;
@@ -178,7 +182,7 @@ MTL::ComputePipelineState* get_copy_kernel(
    kernel_source +=
        get_template_definition("v_" + lib_name, "copy_v", in_type, out_type);
    kernel_source += get_template_definition(
-        "g1_" + lib_name, "copy_g_nd1", in_type, out_type);
+        "g1_" + lib_name, "copy_g_nd1", in_type, out_type, "int");
    kernel_source += get_template_definition(
        "g2_" + lib_name, "copy_g_nd2", in_type, out_type, "int");
    kernel_source += get_template_definition(
@@ -186,19 +190,23 @@ MTL::ComputePipelineState* get_copy_kernel(
    kernel_source += get_template_definition(
        "gn2_" + lib_name, "copy_g", in_type, out_type, 2, "int");
    kernel_source += get_template_definition(
-        "gg1_" + lib_name, "copy_gg_nd1", in_type, out_type);
+        "gg1_" + lib_name, "copy_gg_nd1", in_type, out_type, "int");
    kernel_source += get_template_definition(
        "gg2_" + lib_name, "copy_gg_nd2", in_type, out_type, "int");
    kernel_source += get_template_definition(
        "gg3_" + lib_name, "copy_gg_nd3", in_type, out_type, "int");
    kernel_source += get_template_definition(
        "ggn2_" + lib_name, "copy_gg", in_type, out_type, 2, "int");
+    kernel_source += get_template_definition(
+        "g1large_" + lib_name, "copy_g_nd1", in_type, out_type);
    kernel_source += get_template_definition(
        "g2large_" + lib_name, "copy_g_nd2", in_type, out_type);
    kernel_source += get_template_definition(
        "g3large_" + lib_name, "copy_g_nd3", in_type, out_type);
    kernel_source += get_template_definition(
        "gn4large_" + lib_name, "copy_g", in_type, out_type, 4);
+    kernel_source += get_template_definition(
+        "gg1large_" + lib_name, "copy_gg_nd1", in_type, out_type);
    kernel_source += get_template_definition(
        "gg2large_" + lib_name, "copy_gg_nd2", in_type, out_type);
    kernel_source += get_template_definition(
@@ -210,6 +218,38 @@ MTL::ComputePipelineState* get_copy_kernel(
  return d.get_kernel(kernel_name, lib);
 }

+MTL::ComputePipelineState* get_dynamic_copy_kernel(
+    metal::Device& d,
+    const std::string& kernel_name,
+    const array& in,
+    const array& out) {
+  std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
+  auto lib = d.get_library(lib_name, [&]() {
+    std::string kernel_source = metal::utils();
+    kernel_source += metal::copy();
+    auto in_type = get_type_string(in.dtype());
+    auto out_type = get_type_string(out.dtype());
+    kernel_source += get_template_definition(
+        "gg1_" + lib_name, "copy_gg_dynamic_nd1", in_type, out_type, "int");
+    kernel_source += get_template_definition(
+        "gg2_" + lib_name, "copy_gg_dynamic_nd2", in_type, out_type, "int");
+    kernel_source += get_template_definition(
+        "gg3_" + lib_name, "copy_gg_dynamic_nd3", in_type, out_type, "int");
+    kernel_source += get_template_definition(
+        "ggn2_" + lib_name, "copy_gg_dynamic", in_type, out_type, 2, "int");
+    kernel_source += get_template_definition(
+        "gg1large_" + lib_name, "copy_gg_dynamic_nd1", in_type, out_type);
+    kernel_source += get_template_definition(
+        "gg2large_" + lib_name, "copy_gg_dynamic_nd2", in_type, out_type);
+    kernel_source += get_template_definition(
+        "gg3large_" + lib_name, "copy_gg_dynamic_nd3", in_type, out_type);
+    kernel_source += get_template_definition(
+        "ggn4large_" + lib_name, "copy_gg_dynamic", in_type, out_type, 4);
+    return kernel_source;
+  });
+  return d.get_kernel(kernel_name, lib);
+}
+
 MTL::ComputePipelineState* get_softmax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
--- a/mlx/backend/metal/kernels.h
+++ b/mlx/backend/metal/kernels.h
@@ -45,6 +45,12 @@ MTL::ComputePipelineState* get_copy_kernel(
    const array& in,
    const array& out);

+MTL::ComputePipelineState* get_dynamic_copy_kernel(
+    metal::Device& d,
+    const std::string& kernel_name,
+    const array& in,
+    const array& out);
+
 MTL::ComputePipelineState* get_softmax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
--- a/mlx/backend/metal/kernels/arg_reduce.metal
+++ b/mlx/backend/metal/kernels/arg_reduce.metal
@@ -75,10 +75,10 @@ template <typename T, typename Op, int N_READS = 4>
    const device T* in [[buffer(0)]],
    device uint32_t* out [[buffer(1)]],
    const constant int* shape [[buffer(2)]],
-    const constant size_t* in_strides [[buffer(3)]],
-    const constant size_t* out_strides [[buffer(4)]],
+    const constant int64_t* in_strides [[buffer(3)]],
+    const constant int64_t* out_strides [[buffer(4)]],
    const constant size_t& ndim [[buffer(5)]],
-    const constant size_t& axis_stride [[buffer(6)]],
+    const constant int64_t& axis_stride [[buffer(6)]],
    const constant size_t& axis_size [[buffer(7)]],
    uint gid [[thread_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
--- a/mlx/backend/metal/kernels/binary.h
+++ b/mlx/backend/metal/kernels/binary.h
@@ -43,7 +43,7 @@ template <typename T, typename U, typename Op>
    device U* c,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  size_t offset = index.x + grid_dim.x * size_t(index.y);
+  int64_t offset = index.x + grid_dim.x * int64_t(index.y);
  c[offset] = Op()(a[0], b[offset]);
 }

@@ -54,7 +54,7 @@ template <typename T, typename U, typename Op>
    device U* c,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  size_t offset = index.x + grid_dim.x * size_t(index.y);
+  int64_t offset = index.x + grid_dim.x * int64_t(index.y);
  c[offset] = Op()(a[offset], b[0]);
 }

@@ -65,49 +65,49 @@ template <typename T, typename U, typename Op>
    device U* c,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  size_t offset = index.x + grid_dim.x * size_t(index.y);
+  int64_t offset = index.x + grid_dim.x * int64_t(index.y);
  c[offset] = Op()(a[offset], b[offset]);
 }

-template <typename T, typename U, typename Op, typename IdxT = size_t>
+template <typename T, typename U, typename Op, typename IdxT = int64_t>
 [[kernel]] void binary_g_nd1(
    device const T* a,
    device const T* b,
    device U* c,
-    constant const size_t& a_stride,
-    constant const size_t& b_stride,
+    constant const int64_t& a_stride,
+    constant const int64_t& b_stride,
    uint index [[thread_position_in_grid]]) {
-  auto a_idx = elem_to_loc_1<size_t, IdxT>(index, a_stride);
-  auto b_idx = elem_to_loc_1<size_t, IdxT>(index, b_stride);
+  auto a_idx = elem_to_loc_1<IdxT>(index, a_stride);
+  auto b_idx = elem_to_loc_1<IdxT>(index, b_stride);
  c[index] = Op()(a[a_idx], b[b_idx]);
 }

-template <typename T, typename U, typename Op, typename IdxT = size_t>
+template <typename T, typename U, typename Op, typename IdxT = int64_t>
 [[kernel]] void binary_g_nd2(
    device const T* a,
    device const T* b,
    device U* c,
-    constant const size_t a_strides[2],
-    constant const size_t b_strides[2],
+    constant const int64_t a_strides[2],
+    constant const int64_t b_strides[2],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  auto a_idx = elem_to_loc_2<size_t, IdxT>(index, a_strides);
-  auto b_idx = elem_to_loc_2<size_t, IdxT>(index, b_strides);
+  auto a_idx = elem_to_loc_2<IdxT>(index, a_strides);
+  auto b_idx = elem_to_loc_2<IdxT>(index, b_strides);
  IdxT out_idx = index.x + IdxT(grid_dim.x) * index.y;
  c[out_idx] = Op()(a[a_idx], b[b_idx]);
 }

-template <typename T, typename U, typename Op, typename IdxT = size_t>
+template <typename T, typename U, typename Op, typename IdxT = int64_t>
 [[kernel]] void binary_g_nd3(
    device const T* a,
    device const T* b,
    device U* c,
-    constant const size_t a_strides[3],
-    constant const size_t b_strides[3],
+    constant const int64_t a_strides[3],
+    constant const int64_t b_strides[3],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
-  auto a_idx = elem_to_loc_3<size_t, IdxT>(index, a_strides);
-  auto b_idx = elem_to_loc_3<size_t, IdxT>(index, b_strides);
+  auto a_idx = elem_to_loc_3<IdxT>(index, a_strides);
+  auto b_idx = elem_to_loc_3<IdxT>(index, b_strides);
  IdxT out_idx = index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);
  c[out_idx] = Op()(a[a_idx], b[b_idx]);
 }
@@ -117,18 +117,18 @@ template <
    typename U,
    typename Op,
    int N = 1,
-    typename IdxT = size_t>
+    typename IdxT = int64_t>
 [[kernel]] void binary_g(
    device const T* a,
    device const T* b,
    device U* c,
    constant const int* shape,
-    constant const size_t* a_strides,
-    constant const size_t* b_strides,
+    constant const int64_t* a_strides,
+    constant const int64_t* b_strides,
    constant const int& ndim,
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
-  auto idx = elem_to_loc_2_nd<size_t, IdxT>(
+  auto idx = elem_to_loc_2_nd<IdxT>(
      {N * index.x, index.y, index.z}, shape, a_strides, b_strides, ndim);
  auto xshape = shape[ndim - 1];
  IdxT out_idx = N * index.x + xshape * (index.y + IdxT(grid_dim.y) * index.z);
--- a/mlx/backend/metal/kernels/binary.metal
+++ b/mlx/backend/metal/kernels/binary.metal
@@ -9,21 +9,21 @@
 #include "mlx/backend/metal/kernels/binary_ops.h"
 #include "mlx/backend/metal/kernels/binary.h"

-#define instantiate_binary_all(op, tname, itype, otype)                        \
-  instantiate_kernel("ss_" #op #tname, binary_ss, itype, otype, op)            \
-  instantiate_kernel("sv_" #op #tname, binary_sv, itype, otype, op)            \
-  instantiate_kernel("vs_" #op #tname, binary_vs, itype, otype, op)            \
-  instantiate_kernel("vv_" #op #tname, binary_vv, itype, otype, op)            \
-  instantiate_kernel("sv2_" #op #tname, binary_sv2, itype, otype, op)          \
-  instantiate_kernel("vs2_" #op #tname, binary_vs2, itype, otype, op)          \
-  instantiate_kernel("vv2_" #op #tname, binary_vv2, itype, otype, op)          \
-  instantiate_kernel("gn2_" #op #tname, binary_g, itype, otype, op, 2, uint)   \
-  instantiate_kernel("gn4large_" #op #tname, binary_g, itype, otype, op, 4)    \
-  instantiate_kernel("g1_" #op #tname, binary_g_nd1, itype, otype, op, uint)   \
-  instantiate_kernel("g1large_" #op #tname, binary_g_nd1, itype, otype, op)    \
-  instantiate_kernel("g2_" #op #tname, binary_g_nd2, itype, otype, op, uint)   \
-  instantiate_kernel("g2large_" #op #tname, binary_g_nd2, itype, otype, op)    \
-  instantiate_kernel("g3_" #op #tname, binary_g_nd3, itype, otype, op, uint)   \
+#define instantiate_binary_all(op, tname, itype, otype)                     \
+  instantiate_kernel("ss_" #op #tname, binary_ss, itype, otype, op)         \
+  instantiate_kernel("sv_" #op #tname, binary_sv, itype, otype, op)         \
+  instantiate_kernel("vs_" #op #tname, binary_vs, itype, otype, op)         \
+  instantiate_kernel("vv_" #op #tname, binary_vv, itype, otype, op)         \
+  instantiate_kernel("sv2_" #op #tname, binary_sv2, itype, otype, op)       \
+  instantiate_kernel("vs2_" #op #tname, binary_vs2, itype, otype, op)       \
+  instantiate_kernel("vv2_" #op #tname, binary_vv2, itype, otype, op)       \
+  instantiate_kernel("gn2_" #op #tname, binary_g, itype, otype, op, 2, int) \
+  instantiate_kernel("gn4large_" #op #tname, binary_g, itype, otype, op, 4) \
+  instantiate_kernel("g1_" #op #tname, binary_g_nd1, itype, otype, op, int) \
+  instantiate_kernel("g1large_" #op #tname, binary_g_nd1, itype, otype, op) \
+  instantiate_kernel("g2_" #op #tname, binary_g_nd2, itype, otype, op, int) \
+  instantiate_kernel("g2large_" #op #tname, binary_g_nd2, itype, otype, op) \
+  instantiate_kernel("g3_" #op #tname, binary_g_nd3, itype, otype, op, int) \
  instantiate_kernel("g3large_" #op #tname, binary_g_nd3, itype, otype, op)

 #define instantiate_binary_integer(op)                   \
--- a/mlx/backend/metal/kernels/binary_two.h
+++ b/mlx/backend/metal/kernels/binary_two.h
@@ -56,7 +56,7 @@ template <typename T, typename U, typename Op>
    device U* d,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  size_t offset = index.x + grid_dim.x * size_t(index.y);
+  auto offset = index.x + grid_dim.x * int64_t(index.y);
  auto out = Op()(a[0], b[offset]);
  c[offset] = out[0];
  d[offset] = out[1];
@@ -70,7 +70,7 @@ template <typename T, typename U, typename Op>
    device U* d,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  size_t offset = index.x + grid_dim.x * size_t(index.y);
+  auto offset = index.x + grid_dim.x * int64_t(index.y);
  auto out = Op()(a[offset], b[0]);
  c[offset] = out[0];
  d[offset] = out[1];
@@ -84,58 +84,58 @@ template <typename T, typename U, typename Op>
    device U* d,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  size_t offset = index.x + grid_dim.x * size_t(index.y);
+  auto offset = index.x + grid_dim.x * int64_t(index.y);
  auto out = Op()(a[offset], b[offset]);
  c[offset] = out[0];
  d[offset] = out[1];
 }

-template <typename T, typename U, typename Op, typename IdxT = size_t>
+template <typename T, typename U, typename Op, typename IdxT = int64_t>
 [[kernel]] void binary_g_nd1(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
-    constant const size_t& a_stride,
-    constant const size_t& b_stride,
+    constant const int64_t& a_stride,
+    constant const int64_t& b_stride,
    uint index [[thread_position_in_grid]]) {
-  auto a_idx = elem_to_loc_1<size_t, IdxT>(index, a_stride);
-  auto b_idx = elem_to_loc_1<size_t, IdxT>(index, b_stride);
+  auto a_idx = elem_to_loc_1<IdxT>(index, a_stride);
+  auto b_idx = elem_to_loc_1<IdxT>(index, b_stride);
  auto out = Op()(a[a_idx], b[b_idx]);
  c[index] = out[0];
  d[index] = out[1];
 }

-template <typename T, typename U, typename Op, typename IdxT = size_t>
+template <typename T, typename U, typename Op, typename IdxT = int64_t>
 [[kernel]] void binary_g_nd2(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
-    constant const size_t a_strides[2],
-    constant const size_t b_strides[2],
+    constant const int64_t a_strides[2],
+    constant const int64_t b_strides[2],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  auto a_idx = elem_to_loc_2<size_t, IdxT>(index, a_strides);
-  auto b_idx = elem_to_loc_2<size_t, IdxT>(index, b_strides);
+  auto a_idx = elem_to_loc_2<IdxT>(index, a_strides);
+  auto b_idx = elem_to_loc_2<IdxT>(index, b_strides);
  IdxT out_idx = index.x + IdxT(grid_dim.x) * index.y;
  auto out = Op()(a[a_idx], b[b_idx]);
  c[out_idx] = out[0];
  d[out_idx] = out[1];
 }

-template <typename T, typename U, typename Op, typename IdxT = size_t>
+template <typename T, typename U, typename Op, typename IdxT = int64_t>
 [[kernel]] void binary_g_nd3(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
-    constant const size_t a_strides[3],
-    constant const size_t b_strides[3],
+    constant const int64_t a_strides[3],
+    constant const int64_t b_strides[3],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
-  auto a_idx = elem_to_loc_3<size_t, IdxT>(index, a_strides);
-  auto b_idx = elem_to_loc_3<size_t, IdxT>(index, b_strides);
+  auto a_idx = elem_to_loc_3<IdxT>(index, a_strides);
+  auto b_idx = elem_to_loc_3<IdxT>(index, b_strides);
  IdxT out_idx = index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);
  auto out = Op()(a[a_idx], b[b_idx]);
  c[out_idx] = out[0];
@@ -147,19 +147,19 @@ template <
    typename U,
    typename Op,
    int N = 1,
-    typename IdxT = size_t>
+    typename IdxT = int64_t>
 [[kernel]] void binary_g(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant const int* shape,
-    constant const size_t* a_strides,
-    constant const size_t* b_strides,
+    constant const int64_t* a_strides,
+    constant const int64_t* b_strides,
    constant const int& ndim,
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
-  auto idx = elem_to_loc_2_nd<size_t, IdxT>(
+  auto idx = elem_to_loc_2_nd<IdxT>(
      {N * index.x, index.y, index.z}, shape, a_strides, b_strides, ndim);
  auto xshape = shape[ndim - 1];
  IdxT out_idx = N * index.x + xshape * (index.y + IdxT(grid_dim.y) * index.z);
--- a/mlx/backend/metal/kernels/binary_two.metal
+++ b/mlx/backend/metal/kernels/binary_two.metal
@@ -7,21 +7,21 @@
 #include "mlx/backend/metal/kernels/binary_ops.h"
 #include "mlx/backend/metal/kernels/binary_two.h"

-#define instantiate_binary_all(op, tname, itype, otype)                      \
-  instantiate_kernel("ss_" #op #tname, binary_ss, itype, otype, op)          \
-  instantiate_kernel("sv_" #op #tname, binary_sv, itype, otype, op)          \
-  instantiate_kernel("vs_" #op #tname, binary_vs, itype, otype, op)          \
-  instantiate_kernel("vv_" #op #tname, binary_vv, itype, otype, op)          \
-  instantiate_kernel("sv2_" #op #tname, binary_sv2, itype, otype, op)        \
-  instantiate_kernel("vs2_" #op #tname, binary_vs2, itype, otype, op)        \
-  instantiate_kernel("vv2_" #op #tname, binary_vv2, itype, otype, op)        \
-  instantiate_kernel("gn2_" #op #tname, binary_g, itype, otype, op, 2, uint) \
-  instantiate_kernel("gn4large_" #op #tname, binary_g, itype, otype, op, 4)  \
-  instantiate_kernel("g1_" #op #tname, binary_g_nd1, itype, otype, op, uint) \
-  instantiate_kernel("g2_" #op #tname, binary_g_nd2, itype, otype, op, uint) \
-  instantiate_kernel("g3_" #op #tname, binary_g_nd3, itype, otype, op, uint) \
-  instantiate_kernel("g1large_" #op #tname, binary_g_nd1, itype, otype, op)  \
-  instantiate_kernel("g2large_" #op #tname, binary_g_nd2, itype, otype, op)  \
+#define instantiate_binary_all(op, tname, itype, otype)                     \
+  instantiate_kernel("ss_" #op #tname, binary_ss, itype, otype, op)         \
+  instantiate_kernel("sv_" #op #tname, binary_sv, itype, otype, op)         \
+  instantiate_kernel("vs_" #op #tname, binary_vs, itype, otype, op)         \
+  instantiate_kernel("vv_" #op #tname, binary_vv, itype, otype, op)         \
+  instantiate_kernel("sv2_" #op #tname, binary_sv2, itype, otype, op)       \
+  instantiate_kernel("vs2_" #op #tname, binary_vs2, itype, otype, op)       \
+  instantiate_kernel("vv2_" #op #tname, binary_vv2, itype, otype, op)       \
+  instantiate_kernel("gn2_" #op #tname, binary_g, itype, otype, op, 2, int) \
+  instantiate_kernel("gn4large_" #op #tname, binary_g, itype, otype, op, 4) \
+  instantiate_kernel("g1_" #op #tname, binary_g_nd1, itype, otype, op, int) \
+  instantiate_kernel("g2_" #op #tname, binary_g_nd2, itype, otype, op, int) \
+  instantiate_kernel("g3_" #op #tname, binary_g_nd3, itype, otype, op, int) \
+  instantiate_kernel("g1large_" #op #tname, binary_g_nd1, itype, otype, op) \
+  instantiate_kernel("g2large_" #op #tname, binary_g_nd2, itype, otype, op) \
  instantiate_kernel("g3large_" #op #tname, binary_g_nd3, itype, otype, op)

 #define instantiate_binary_float(op)                \
--- a/mlx/backend/metal/kernels/copy.h
+++ b/mlx/backend/metal/kernels/copy.h
@@ -22,7 +22,7 @@ template <typename T, typename U>
    device U* dst [[buffer(1)]],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  size_t offset = index.x + grid_dim.x * size_t(index.y);
+  auto offset = index.x + grid_dim.x * int64_t(index.y);
  dst[offset] = static_cast<U>(src[0]);
 }

@@ -32,7 +32,7 @@ template <typename T, typename U>
    device U* dst [[buffer(1)]],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  size_t offset = index.x + grid_dim.x * size_t(index.y);
+  auto offset = index.x + grid_dim.x * int64_t(index.y);
  dst[offset] = static_cast<U>(src[offset]);
 }

@@ -42,7 +42,7 @@ template <typename T, typename U, typename IdxT = int64_t>
    device U* dst [[buffer(1)]],
    constant const int64_t& src_stride [[buffer(3)]],
    uint index [[thread_position_in_grid]]) {
-  auto src_idx = elem_to_loc_1<int64_t, IdxT>(index, src_stride);
+  auto src_idx = elem_to_loc_1<IdxT>(index, src_stride);
  dst[index] = static_cast<U>(src[src_idx]);
 }

@@ -53,7 +53,7 @@ template <typename T, typename U, typename IdxT = int64_t>
    constant const int64_t* src_strides [[buffer(3)]],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
-  auto src_idx = elem_to_loc_2<int64_t, IdxT>(index, src_strides);
+  auto src_idx = elem_to_loc_2<IdxT>(index, src_strides);
  IdxT dst_idx = index.x + IdxT(grid_dim.x) * index.y;
  dst[dst_idx] = static_cast<U>(src[src_idx]);
 }
@@ -65,7 +65,7 @@ template <typename T, typename U, typename IdxT = int64_t>
    constant const int64_t* src_strides [[buffer(3)]],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
-  auto src_idx = elem_to_loc_3<int64_t, IdxT>(index, src_strides);
+  auto src_idx = elem_to_loc_3<IdxT>(index, src_strides);
  IdxT dst_idx =
      index.x + IdxT(grid_dim.x) * (index.y + IdxT(grid_dim.y) * index.z);
  dst[dst_idx] = static_cast<U>(src[src_idx]);
@@ -80,7 +80,7 @@ template <typename T, typename U, int N = 1, typename IdxT = int64_t>
    constant const int& ndim [[buffer(5)]],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
-  auto src_idx = elem_to_loc<int64_t, IdxT>(
+  auto src_idx = elem_to_loc<IdxT>(
      {N * index.x, index.y, index.z}, src_shape, src_strides, ndim);
  if (N == 1) {
    IdxT dst_idx =
@@ -104,8 +104,8 @@ template <typename T, typename U, typename IdxT = int64_t>
    constant const int64_t& src_stride [[buffer(3)]],
    constant const int64_t& dst_stride [[buffer(4)]],
    uint index [[thread_position_in_grid]]) {
-  auto src_idx = elem_to_loc_1<int64_t, IdxT>(index, src_stride);
-  auto dst_idx = elem_to_loc_1<int64_t, IdxT>(index, dst_stride);
+  auto src_idx = elem_to_loc_1<IdxT>(index, src_stride);
+  auto dst_idx = elem_to_loc_1<IdxT>(index, dst_stride);
  dst[dst_idx] = static_cast<U>(src[src_idx]);
 }

@@ -116,8 +116,8 @@ template <typename T, typename U, typename IdxT = int64_t>
    constant const int64_t* src_strides [[buffer(3)]],
    constant const int64_t* dst_strides [[buffer(4)]],
    uint2 index [[thread_position_in_grid]]) {
-  auto src_idx = elem_to_loc_2<int64_t, IdxT>(index, src_strides);
-  auto dst_idx = elem_to_loc_2<int64_t, IdxT>(index, dst_strides);
+  auto src_idx = elem_to_loc_2<IdxT>(index, src_strides);
+  auto dst_idx = elem_to_loc_2<IdxT>(index, dst_strides);
  dst[dst_idx] = static_cast<U>(src[src_idx]);
 }

@@ -128,8 +128,8 @@ template <typename T, typename U, typename IdxT = int64_t>
    constant const int64_t* src_strides [[buffer(3)]],
    constant const int64_t* dst_strides [[buffer(4)]],
    uint3 index [[thread_position_in_grid]]) {
-  auto src_idx = elem_to_loc_3<int64_t, IdxT>(index, src_strides);
-  auto dst_idx = elem_to_loc_3<int64_t, IdxT>(index, dst_strides);
+  auto src_idx = elem_to_loc_3<IdxT>(index, src_strides);
+  auto dst_idx = elem_to_loc_3<IdxT>(index, dst_strides);
  dst[dst_idx] = static_cast<U>(src[src_idx]);
 }

@@ -142,7 +142,7 @@ template <typename T, typename U, int N = 1, typename IdxT = int64_t>
    constant const int64_t* dst_strides [[buffer(4)]],
    constant const int& ndim [[buffer(5)]],
    uint3 index [[thread_position_in_grid]]) {
-  auto idx = elem_to_loc_2_nd<int64_t, IdxT>(
+  auto idx = elem_to_loc_2_nd<IdxT>(
      {N * index.x, index.y, index.z},
      src_shape,
      src_strides,
@@ -161,3 +161,78 @@ template <typename T, typename U, int N = 1, typename IdxT = int64_t>
    idx.y += dst_xstride;
  }
 }
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_gg_dynamic_nd1( // 1-D strided copy whose src/dst starting element offsets are read from buffers 6 and 7
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t& src_stride [[buffer(3)]],
+    constant const int64_t& dst_stride [[buffer(4)]],
+    constant const int64_t& src_offset [[buffer(6)]], // added to the computed source index
+    constant const int64_t& dst_offset [[buffer(7)]], // added to the computed destination index
+    uint index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_1<IdxT>(index, src_stride);
+  auto dst_idx = elem_to_loc_1<IdxT>(index, dst_stride);
+  dst[dst_idx + dst_offset] = static_cast<U>(src[src_idx + src_offset]); // explicit T -> U conversion, matching copy_gg_nd1
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_gg_dynamic_nd2( // 2-D strided copy whose src/dst starting element offsets are read from buffers 6 and 7
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int64_t* dst_strides [[buffer(4)]],
+    constant const int64_t& src_offset [[buffer(6)]], // added to the computed source index
+    constant const int64_t& dst_offset [[buffer(7)]], // added to the computed destination index
+    uint2 index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_2<IdxT>(index, src_strides);
+  auto dst_idx = elem_to_loc_2<IdxT>(index, dst_strides);
+  dst[dst_idx + dst_offset] = static_cast<U>(src[src_idx + src_offset]); // explicit T -> U conversion, matching copy_gg_nd2
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_gg_dynamic_nd3( // 3-D strided copy whose src/dst starting element offsets are read from buffers 6 and 7
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int64_t* dst_strides [[buffer(4)]],
+    constant const int64_t& src_offset [[buffer(6)]], // added to the computed source index
+    constant const int64_t& dst_offset [[buffer(7)]], // added to the computed destination index
+    uint3 index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_3<IdxT>(index, src_strides);
+  auto dst_idx = elem_to_loc_3<IdxT>(index, dst_strides);
+  dst[dst_idx + dst_offset] = static_cast<U>(src[src_idx + src_offset]); // explicit T -> U conversion, matching copy_gg_nd3
+}
+
+template <typename T, typename U, int N = 1, typename IdxT = int64_t>
+[[kernel]] void copy_gg_dynamic( // ndim-dimensional strided copy with runtime offsets; each thread writes up to N elements along the last axis
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int* src_shape [[buffer(2)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int64_t* dst_strides [[buffer(4)]],
+    constant const int& ndim [[buffer(5)]],
+    constant const int64_t& src_offset [[buffer(6)]], // element offset applied to src before indexing
+    constant const int64_t& dst_offset [[buffer(7)]], // element offset applied to dst before indexing
+    uint3 index [[thread_position_in_grid]]) {
+  src += src_offset; // rebase both pointers once so the indexing below needs no offset term
+  dst += dst_offset;
+  auto idx = elem_to_loc_2_nd<IdxT>(
+      {N * index.x, index.y, index.z},
+      src_shape,
+      src_strides,
+      dst_strides,
+      ndim);
+  if (N == 1) {
+    dst[idx.y] = static_cast<U>(src[idx.x]); // idx.x indexes src, idx.y indexes dst; explicit T -> U conversion
+    return;
+  }
+  IdxT src_xstride = src_strides[ndim - 1];
+  IdxT dst_xstride = dst_strides[ndim - 1];
+  auto xshape = src_shape[ndim - 1];
+  for (int i = 0; i < N && (int(N * index.x) + i) < xshape; ++i) { // stop early when the last axis ends inside this thread's run of N
+    dst[idx.y] = static_cast<U>(src[idx.x]); // explicit T -> U conversion, as in the N == 1 path
+    idx.x += src_xstride;
+    idx.y += dst_xstride;
+  }
+}
--- a/mlx/backend/metal/kernels/copy.metal
+++ b/mlx/backend/metal/kernels/copy.metal
@@ -4,29 +4,40 @@
 #include "mlx/backend/metal/kernels/utils.h"
 #include "mlx/backend/metal/kernels/copy.h"

-#define instantiate_copy_all(tname, itype, otype)                       \
-  instantiate_kernel("s_copy" #tname, copy_s, itype, otype)             \
-  instantiate_kernel("v_copy" #tname, copy_v, itype, otype)             \
-  instantiate_kernel("s2_copy" #tname, copy_s2, itype, otype)           \
-  instantiate_kernel("v2_copy" #tname, copy_v2, itype, otype)           \
-  instantiate_kernel("g1_copy" #tname, copy_g_nd1, itype, otype, int)   \
-  instantiate_kernel("g2_copy" #tname, copy_g_nd2, itype, otype, int)   \
-  instantiate_kernel("g3_copy" #tname, copy_g_nd3, itype, otype, int)   \
-  instantiate_kernel("gg1_copy" #tname, copy_gg_nd1, itype, otype, int) \
-  instantiate_kernel("gg2_copy" #tname, copy_gg_nd2, itype, otype, int) \
-  instantiate_kernel("gg3_copy" #tname, copy_gg_nd3, itype, otype, int) \
-  instantiate_kernel("gn2_copy" #tname, copy_g, itype, otype, 2, int)   \
-  instantiate_kernel("ggn2_copy" #tname, copy_gg, itype, otype, 2, int) \
-  instantiate_kernel("g1large_copy" #tname, copy_g_nd1, itype, otype)   \
-  instantiate_kernel("g2large_copy" #tname, copy_g_nd2, itype, otype)   \
-  instantiate_kernel("g3large_copy" #tname, copy_g_nd3, itype, otype)   \
-  instantiate_kernel("gg1large_copy" #tname, copy_gg_nd1, itype, otype) \
-  instantiate_kernel("gg2large_copy" #tname, copy_gg_nd2, itype, otype) \
-  instantiate_kernel("gg3large_copy" #tname, copy_gg_nd3, itype, otype) \
-  instantiate_kernel("gn4large_copy" #tname, copy_g, itype, otype, 4)   \
-  instantiate_kernel("ggn4large_copy" #tname, copy_gg, itype, otype, 4)
+#define instantiate_copy_all(tname, itype, otype)                     /* s/v/s2/v2 and g* copy kernels for one (itype, otype) pair; the "large" names omit the int index-type argument */\
+  instantiate_kernel("s_copy" #tname, copy_s, itype, otype)           \
+  instantiate_kernel("v_copy" #tname, copy_v, itype, otype)           \
+  instantiate_kernel("s2_copy" #tname, copy_s2, itype, otype)         \
+  instantiate_kernel("v2_copy" #tname, copy_v2, itype, otype)         \
+  instantiate_kernel("g1_copy" #tname, copy_g_nd1, itype, otype, int) \
+  instantiate_kernel("g2_copy" #tname, copy_g_nd2, itype, otype, int) \
+  instantiate_kernel("g3_copy" #tname, copy_g_nd3, itype, otype, int) \
+  instantiate_kernel("gn2_copy" #tname, copy_g, itype, otype, 2, int) \
+  instantiate_kernel("g1large_copy" #tname, copy_g_nd1, itype, otype) \
+  instantiate_kernel("g2large_copy" #tname, copy_g_nd2, itype, otype) \
+  instantiate_kernel("g3large_copy" #tname, copy_g_nd3, itype, otype) \
+  instantiate_kernel("gn4large_copy" #tname, copy_g, itype, otype, 4)
+
+#define instantiate_copy_same(tname, type)                                            /* gg* and gg*_dynamic copy kernels, instantiated with the same type for input and output */\
+  instantiate_kernel("gg1_copy" #tname, copy_gg_nd1, type, type, int)                 \
+  instantiate_kernel("gg2_copy" #tname, copy_gg_nd2, type, type, int)                 \
+  instantiate_kernel("gg3_copy" #tname, copy_gg_nd3, type, type, int)                 \
+  instantiate_kernel("ggn2_copy" #tname, copy_gg, type, type, 2, int)                 \
+  instantiate_kernel("gg1large_copy" #tname, copy_gg_nd1, type, type)                 \
+  instantiate_kernel("gg2large_copy" #tname, copy_gg_nd2, type, type)                 \
+  instantiate_kernel("gg3large_copy" #tname, copy_gg_nd3, type, type)                 \
+  instantiate_kernel("ggn4large_copy" #tname, copy_gg, type, type, 4)                 \
+  instantiate_kernel("gg1_dynamic_copy" #tname, copy_gg_dynamic_nd1, type, type, int) \
+  instantiate_kernel("gg2_dynamic_copy" #tname, copy_gg_dynamic_nd2, type, type, int) \
+  instantiate_kernel("gg3_dynamic_copy" #tname, copy_gg_dynamic_nd3, type, type, int) \
+  instantiate_kernel("ggn2_dynamic_copy" #tname, copy_gg_dynamic, type, type, 2, int) \
+  instantiate_kernel("gg1large_dynamic_copy" #tname, copy_gg_dynamic_nd1, type, type) \
+  instantiate_kernel("gg2large_dynamic_copy" #tname, copy_gg_dynamic_nd2, type, type) \
+  instantiate_kernel("gg3large_dynamic_copy" #tname, copy_gg_dynamic_nd3, type, type) \
+  instantiate_kernel("ggn4large_dynamic_copy" #tname, copy_gg_dynamic, type, type, 4)

 #define instantiate_copy_itype(itname, itype)                \
+  instantiate_copy_same(itname ##itname, itype)              \
  instantiate_copy_all(itname ##bool_, itype, bool)          \
  instantiate_copy_all(itname ##uint8, itype, uint8_t)       \
  instantiate_copy_all(itname ##uint16, itype, uint16_t)     \
--- a/mlx/backend/metal/kernels/gather.h
+++ b/mlx/backend/metal/kernels/gather.h
@@ -9,7 +9,7 @@ METAL_FUNC void gather_impl(
    const device T* src [[buffer(0)]],
    device T* out [[buffer(1)]],
    const constant int* src_shape [[buffer(2)]],
-    const constant size_t* src_strides [[buffer(3)]],
+    const constant int64_t* src_strides [[buffer(3)]],
    const constant size_t& src_ndim [[buffer(4)]],
    const constant int* slice_sizes [[buffer(5)]],
    const constant int* axes [[buffer(6)]],
@@ -27,7 +27,7 @@ METAL_FUNC void gather_impl(
      idx_loc = index.x * static_cast<LocT>(indices.strides[indices.ndim * i]);
      idx_loc += indices.row_contiguous[i]
          ? index.y
-          : elem_to_loc<size_t, LocT>(
+          : elem_to_loc<LocT>(
                index.y,
                &indices.shapes[indices.ndim * i + 1],
                &indices.strides[indices.ndim * i + 1],
@@ -39,7 +39,7 @@ METAL_FUNC void gather_impl(
  }

  auto src_offset =
-      elem_to_loc<size_t, LocT>(index.z, slice_sizes, src_strides, src_ndim);
+      elem_to_loc<LocT>(index.z, slice_sizes, src_strides, src_ndim);

  LocT out_idx = index.z;
  if (IDX_NDIM == 1) {
--- a/mlx/backend/metal/kernels/gemv.metal
+++ b/mlx/backend/metal/kernels/gemv.metal
@@ -436,9 +436,9 @@ template <
    const constant float& beta [[buffer(8)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
-    const constant size_t* vector_batch_stride [[buffer(11)]],
-    const constant size_t* matrix_batch_stride [[buffer(12)]],
-    const constant size_t* bias_batch_stride [[buffer(13)]],
+    const constant int64_t* vector_batch_stride [[buffer(11)]],
+    const constant int64_t* matrix_batch_stride [[buffer(12)]],
+    const constant int64_t* bias_batch_stride [[buffer(13)]],
    const constant int& bias_stride [[buffer(14)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
@@ -486,31 +486,21 @@ template <
      simd_lid);
 }

-#define instantiate_gemv_helper(                                             \
-    name, itype, bm, bn, sm, sn, tm, tn, nc, axpby)                          \
-  template [[host_name("gemv_" #name "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn \
-                       "_tm" #tm "_tn" #tn "_nc" #nc                         \
-                       "_axpby" #axpby)]] [[kernel]] void                    \
-  gemv<itype, bm, bn, sm, sn, tm, tn, nc, axpby>(                            \
-      const device itype* mat [[buffer(0)]],                                 \
-      const device itype* in_vec [[buffer(1)]],                              \
-      const device itype* bias [[buffer(2)]],                                \
-      device itype* out_vec [[buffer(3)]],                                   \
-      const constant int& in_vec_size [[buffer(4)]],                         \
-      const constant int& out_vec_size [[buffer(5)]],                        \
-      const constant int& marix_ld [[buffer(6)]],                            \
-      const constant float& alpha [[buffer(7)]],                             \
-      const constant float& beta [[buffer(8)]],                              \
-      const constant int& batch_ndim [[buffer(9)]],                          \
-      const constant int* batch_shape [[buffer(10)]],                        \
-      const constant size_t* vector_batch_stride [[buffer(11)]],             \
-      const constant size_t* matrix_batch_stride [[buffer(12)]],             \
-      const constant size_t* bias_batch_stride [[buffer(13)]],               \
-      const constant int& bias_stride [[buffer(14)]],                        \
-      uint3 tid [[threadgroup_position_in_grid]],                            \
-      uint3 lid [[thread_position_in_threadgroup]],                          \
-      uint simd_gid [[simdgroup_index_in_threadgroup]],                      \
-      uint simd_lid [[thread_index_in_simdgroup]]);
+#define instantiate_gemv_helper(                                      \
+    name, itype, bm, bn, sm, sn, tm, tn, nc, axpby)                   /* passes gemv and its template arguments to instantiate_kernel; the name string is built from name and every numeric parameter */\
+  instantiate_kernel(                                                 \
+      "gemv_" #name "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn "_tm" #tm \
+      "_tn" #tn "_nc" #nc "_axpby" #axpby,                            \
+      gemv,                                                           \
+      itype,                                                          \
+      bm,                                                             \
+      bn,                                                             \
+      sm,                                                             \
+      sn,                                                             \
+      tm,                                                             \
+      tn,                                                             \
+      nc,                                                             \
+      axpby)

 // clang-format off
 #define instantiate_gemv(name, itype, bm, bn, tm, tn)              \
@@ -549,13 +539,13 @@ template <
    const constant float& beta [[buffer(8)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
-    const constant size_t* index_batch_strides [[buffer(11)]],
+    const constant int64_t* index_batch_strides [[buffer(11)]],
    const constant int& vector_batch_ndim [[buffer(12)]],
    const constant int* vector_batch_shape [[buffer(13)]],
-    const constant size_t* vector_batch_stride [[buffer(14)]],
+    const constant int64_t* vector_batch_stride [[buffer(14)]],
    const constant int& matrix_batch_ndim [[buffer(15)]],
    const constant int* matrix_batch_shape [[buffer(16)]],
-    const constant size_t* matrix_batch_stride [[buffer(17)]],
+    const constant int64_t* matrix_batch_stride [[buffer(17)]],
    const constant uint32_t* vec_indices [[buffer(18)]],
    const constant uint32_t* mat_indices [[buffer(19)]],
    uint3 tid [[threadgroup_position_in_grid]],
@@ -571,8 +561,8 @@ template <

  // Update batch offsets
  if (batch_ndim > 1) {
-    const constant size_t* veci_bstrides = index_batch_strides;
-    const constant size_t* mati_bstrides = index_batch_strides + batch_ndim;
+    const constant auto* veci_bstrides = index_batch_strides;
+    const constant auto* mati_bstrides = index_batch_strides + batch_ndim;

    ulong2 batch_offsets = elem_to_loc_broadcast(
        tid.z, batch_shape, veci_bstrides, mati_bstrides, batch_ndim);
@@ -619,37 +609,14 @@ template <
      simd_lid);
 }

-#define instantiate_gemv_bs_helper(nm, itype, bm, bn, sm, sn, tm, tn)   \
-  template [[host_name("gemv_gather_" #nm "_bm" #bm "_bn" #bn "_sm" #sm \
-                       "_sn" #sn "_tm" #tm "_tn" #tn)]] [[kernel]] void \
-  gemv_gather<itype, bm, bn, sm, sn, tm, tn>(                           \
-      const device itype* mat [[buffer(0)]],                            \
-      const device itype* in_vec [[buffer(1)]],                         \
-      const device itype* bias [[buffer(2)]],                           \
-      device itype* out_vec [[buffer(3)]],                              \
-      const constant int& in_vec_size [[buffer(4)]],                    \
-      const constant int& out_vec_size [[buffer(5)]],                   \
-      const constant int& marix_ld [[buffer(6)]],                       \
-      const constant float& alpha [[buffer(7)]],                        \
-      const constant float& beta [[buffer(8)]],                         \
-      const constant int& batch_ndim [[buffer(9)]],                     \
-      const constant int* batch_shape [[buffer(10)]],                   \
-      const constant size_t* index_batch_strides [[buffer(11)]],        \
-      const constant int& vector_batch_ndim [[buffer(12)]],             \
-      const constant int* vector_batch_shape [[buffer(13)]],            \
-      const constant size_t* vector_batch_stride [[buffer(14)]],        \
-      const constant int& matrix_batch_ndim [[buffer(15)]],             \
-      const constant int* matrix_batch_shape [[buffer(16)]],            \
-      const constant size_t* matrix_batch_stride [[buffer(17)]],        \
-      const constant uint32_t* vec_indices [[buffer(18)]],              \
-      const constant uint32_t* mat_indices [[buffer(19)]],              \
-      uint3 tid [[threadgroup_position_in_grid]],                       \
-      uint3 lid [[thread_position_in_threadgroup]],                     \
-      uint simd_gid [[simdgroup_index_in_threadgroup]],                 \
-      uint simd_lid [[thread_index_in_simdgroup]]);
-
 // clang-format off
-#define instantiate_gemv_bs_blocks(name, itype)        \
+#define instantiate_gemv_bs_helper(nm, itype, bm, bn, sm, sn, tm, tn) /* passes gemv_gather and its template arguments to instantiate_kernel; the name string is built from nm and bm..tn */\
+  instantiate_kernel(                                                 \
+    "gemv_gather_" #nm "_bm" #bm "_bn" #bn "_sm" #sm                  \
+                       "_sn" #sn "_tm" #tm "_tn" #tn,                 \
+    gemv_gather, itype, bm, bn, sm, sn, tm, tn)
+
+#define instantiate_gemv_bs_blocks(name, itype)              \
  instantiate_gemv_bs_helper(name, itype, 4, 1, 1, 32, 1, 4) \
  instantiate_gemv_bs_helper(name, itype, 4, 1, 1, 32, 4, 4) \
  instantiate_gemv_bs_helper(name, itype, 8, 1, 1, 32, 4, 4) // clang-format on
@@ -684,9 +651,9 @@ template <
    const constant float& beta [[buffer(8)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
-    const constant size_t* vector_batch_stride [[buffer(11)]],
-    const constant size_t* matrix_batch_stride [[buffer(12)]],
-    const constant size_t* bias_batch_stride [[buffer(13)]],
+    const constant int64_t* vector_batch_stride [[buffer(11)]],
+    const constant int64_t* matrix_batch_stride [[buffer(12)]],
+    const constant int64_t* bias_batch_stride [[buffer(13)]],
    const constant int& bias_stride [[buffer(14)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
@@ -734,33 +701,14 @@ template <
      simd_lid);
 }

-#define instantiate_gemv_t_helper(                                             \
-    name, itype, bm, bn, sm, sn, tm, tn, nc, axpby)                            \
-  template [[host_name("gemv_t_" #name "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn \
-                       "_tm" #tm "_tn" #tn "_nc" #nc                           \
-                       "_axpby" #axpby)]] [[kernel]] void                      \
-  gemv_t<itype, bm, bn, sm, sn, tm, tn, nc, axpby>(                            \
-      const device itype* mat [[buffer(0)]],                                   \
-      const device itype* in_vec [[buffer(1)]],                                \
-      const device itype* bias [[buffer(2)]],                                  \
-      device itype* out_vec [[buffer(3)]],                                     \
-      const constant int& in_vec_size [[buffer(4)]],                           \
-      const constant int& out_vec_size [[buffer(5)]],                          \
-      const constant int& marix_ld [[buffer(6)]],                              \
-      const constant float& alpha [[buffer(7)]],                               \
-      const constant float& beta [[buffer(8)]],                                \
-      const constant int& batch_ndim [[buffer(9)]],                            \
-      const constant int* batch_shape [[buffer(10)]],                          \
-      const constant size_t* vector_batch_stride [[buffer(11)]],               \
-      const constant size_t* matrix_batch_stride [[buffer(12)]],               \
-      const constant size_t* bias_batch_stride [[buffer(13)]],                 \
-      const constant int& bias_stride [[buffer(14)]],                          \
-      uint3 tid [[threadgroup_position_in_grid]],                              \
-      uint3 lid [[thread_position_in_threadgroup]],                            \
-      uint simd_gid [[simdgroup_index_in_threadgroup]],                        \
-      uint simd_lid [[thread_index_in_simdgroup]]);
-
 // clang-format off
+#define instantiate_gemv_t_helper(                          \
+    name, itype, bm, bn, sm, sn, tm, tn, nc, axpby)         /* passes gemv_t and its template arguments to instantiate_kernel; the name string is built from name and every numeric parameter */\
+  instantiate_kernel(                                       \
+    "gemv_t_" #name "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn \
+       "_tm" #tm "_tn" #tn "_nc" #nc "_axpby" #axpby,       \
+  gemv_t, itype, bm, bn, sm, sn, tm, tn, nc, axpby)
+
 #define instantiate_gemv_t(name, itype, bm, bn, sm, sn, tm, tn)        \
  instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 0, 0) \
  instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 0, 1) \
@@ -800,13 +748,13 @@ template <
    const constant float& beta [[buffer(8)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
-    const constant size_t* index_batch_strides [[buffer(11)]],
+    const constant int64_t* index_batch_strides [[buffer(11)]],
    const constant int& vector_batch_ndim [[buffer(12)]],
    const constant int* vector_batch_shape [[buffer(13)]],
-    const constant size_t* vector_batch_stride [[buffer(14)]],
+    const constant int64_t* vector_batch_stride [[buffer(14)]],
    const constant int& matrix_batch_ndim [[buffer(15)]],
    const constant int* matrix_batch_shape [[buffer(16)]],
-    const constant size_t* matrix_batch_stride [[buffer(17)]],
+    const constant int64_t* matrix_batch_stride [[buffer(17)]],
    const constant uint32_t* vec_indices [[buffer(18)]],
    const constant uint32_t* mat_indices [[buffer(19)]],
    uint3 tid [[threadgroup_position_in_grid]],
@@ -822,8 +770,8 @@ template <

  // Update batch offsets
  if (batch_ndim > 1) {
-    const constant size_t* veci_bstrides = index_batch_strides;
-    const constant size_t* mati_bstrides = index_batch_strides + batch_ndim;
+    const constant auto* veci_bstrides = index_batch_strides;
+    const constant auto* mati_bstrides = index_batch_strides + batch_ndim;

    ulong2 batch_offsets = elem_to_loc_broadcast(
        tid.z, batch_shape, veci_bstrides, mati_bstrides, batch_ndim);
@@ -870,36 +818,14 @@ template <
      simd_lid);
 }

-#define instantiate_gemv_t_bs_helper(nm, itype, bm, bn, sm, sn, tm, tn)   \
-  template [[host_name("gemv_t_gather_" #nm "_bm" #bm "_bn" #bn "_sm" #sm \
-                       "_sn" #sn "_tm" #tm "_tn" #tn)]] [[kernel]] void   \
-  gemv_t_gather<itype, bm, bn, sm, sn, tm, tn>(                           \
-      const device itype* mat [[buffer(0)]],                              \
-      const device itype* in_vec [[buffer(1)]],                           \
-      const device itype* bias [[buffer(2)]],                             \
-      device itype* out_vec [[buffer(3)]],                                \
-      const constant int& in_vec_size [[buffer(4)]],                      \
-      const constant int& out_vec_size [[buffer(5)]],                     \
-      const constant int& marix_ld [[buffer(6)]],                         \
-      const constant float& alpha [[buffer(7)]],                          \
-      const constant float& beta [[buffer(8)]],                           \
-      const constant int& batch_ndim [[buffer(9)]],                       \
-      const constant int* batch_shape [[buffer(10)]],                     \
-      const constant size_t* index_batch_strides [[buffer(11)]],          \
-      const constant int& vector_batch_ndim [[buffer(12)]],               \
-      const constant int* vector_batch_shape [[buffer(13)]],              \
-      const constant size_t* vector_batch_stride [[buffer(14)]],          \
-      const constant int& matrix_batch_ndim [[buffer(15)]],               \
-      const constant int* matrix_batch_shape [[buffer(16)]],              \
-      const constant size_t* matrix_batch_stride [[buffer(17)]],          \
-      const constant uint32_t* vec_indices [[buffer(18)]],                \
-      const constant uint32_t* mat_indices [[buffer(19)]],                \
-      uint3 tid [[threadgroup_position_in_grid]],                         \
-      uint3 lid [[thread_position_in_threadgroup]],                       \
-      uint simd_gid [[simdgroup_index_in_threadgroup]],                   \
-      uint simd_lid [[thread_index_in_simdgroup]]);
-
 // clang-format off
+#define instantiate_gemv_t_bs_helper(                  \
+    nm, itype, bm, bn, sm, sn, tm, tn)                 /* passes gemv_t_gather and its template arguments to instantiate_kernel; the name string is built from nm and bm..tn */\
+  instantiate_kernel(                                  \
+    "gemv_t_gather_" #nm "_bm" #bm "_bn" #bn "_sm" #sm \
+       "_sn" #sn "_tm" #tm "_tn" #tn,                  \
+  gemv_t_gather, itype, bm, bn, sm, sn, tm, tn)
+
 #define instantiate_gemv_t_bs_blocks(name, itype)              \
  instantiate_gemv_t_bs_helper(name, itype, 1,  2, 8, 4, 4, 1) \
  instantiate_gemv_t_bs_helper(name, itype, 1,  2, 8, 4, 4, 4) \
--- a/mlx/backend/metal/kernels/gemv_masked.h
+++ b/mlx/backend/metal/kernels/gemv_masked.h
@@ -642,13 +642,13 @@ template <
    const constant int& marix_ld [[buffer(6)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
-    const constant size_t* vector_batch_stride [[buffer(11)]],
-    const constant size_t* matrix_batch_stride [[buffer(12)]],
+    const constant int64_t* vector_batch_stride [[buffer(11)]],
+    const constant int64_t* matrix_batch_stride [[buffer(12)]],
    const device out_mask_t* out_mask [[buffer(20)]],
    const device op_mask_t* mat_mask [[buffer(21)]],
    const device op_mask_t* vec_mask [[buffer(22)]],
    const constant int* mask_strides [[buffer(23)]],
-    const constant size_t* mask_batch_strides [[buffer(24)]],
+    const constant int64_t* mask_batch_strides [[buffer(24)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
@@ -673,8 +673,8 @@ template <
    }

    if (has_operand_mask) {
-      const constant size_t* mask_strides_mat = mask_batch_strides;
-      const constant size_t* mask_strides_vec = mask_strides_mat + batch_ndim;
+      const constant auto* mask_strides_mat = mask_batch_strides;
+      const constant auto* mask_strides_vec = mask_strides_mat + batch_ndim;

      ulong2 batch_offsets = elem_to_loc_broadcast(
          tid.z, batch_shape, mask_strides_mat, mask_strides_vec, batch_ndim);
@@ -742,13 +742,13 @@ template <
    const constant int& marix_ld [[buffer(6)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
-    const constant size_t* vector_batch_stride [[buffer(11)]],
-    const constant size_t* matrix_batch_stride [[buffer(12)]],
+    const constant int64_t* vector_batch_stride [[buffer(11)]],
+    const constant int64_t* matrix_batch_stride [[buffer(12)]],
    const device out_mask_t* out_mask [[buffer(20)]],
    const device op_mask_t* mat_mask [[buffer(21)]],
    const device op_mask_t* vec_mask [[buffer(22)]],
    const constant int* mask_strides [[buffer(23)]],
-    const constant size_t* mask_batch_strides [[buffer(24)]],
+    const constant int64_t* mask_batch_strides [[buffer(24)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
@@ -773,8 +773,8 @@ template <
    }

    if (has_operand_mask) {
-      const constant size_t* mask_strides_mat = mask_batch_strides;
-      const constant size_t* mask_strides_vec = mask_strides_mat + batch_ndim;
+      const constant auto* mask_strides_mat = mask_batch_strides;
+      const constant auto* mask_strides_vec = mask_strides_mat + batch_ndim;

      ulong2 batch_offsets = elem_to_loc_broadcast(
          tid.z, batch_shape, mask_strides_mat, mask_strides_vec, batch_ndim);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Angelos Katharopoulos	1ce0c0fcb0	Bump version (#1761 )	2025-01-09 13:48:20 -08:00
Awni Hannun	657f466402	use sdpa and exportable functions in transformer multi head attention (#1760 )	2025-01-09 13:11:55 -08:00
Alex Barron	c7b0300af5	Fix batched qmv bug (#1758 )	2025-01-09 11:45:57 -08:00
Awni Hannun	da8c885784	Simplify removes no-ops from the tape (#1759 ) * simplify removes no-ops from the tape * comment	2025-01-09 11:23:19 -08:00
Awni Hannun	1ccaf80575	Dynamic broadcasting for shapeless compile/export (#1722 ) * working towards dynamic broadcast * shapeless broadcast * fix build + nits * use broadcast arrays in quantize matmul * some cleanup / consistency * mend * some comments * add vjp, jvp for broadcast axes	2025-01-09 11:04:24 -08:00
Cheng	ec36bfa317	Include command stdout in error message (#1756 ) * Include command stdout in error message * On Windows pclose returns the exit code	2025-01-08 07:17:03 -08:00
Cheng	b8f76f717a	Print exceptions in eval_cpu/eval_gpu and abort (#1754 )	2025-01-08 06:31:09 -08:00
Awni Hannun	d1766f2c70	Add boolean mask support in vector SDPA (#1757 )	2025-01-07 20:24:53 -08:00
Awni Hannun	516ded618b	Dynamic slicing (#1741 ) * dynamic slice and slice update * python bindings + tests + fix set item * fix compile issue * comment * fix jit	2025-01-07 14:02:16 -08:00
Jesper Stemann Andersen	c9c81d0584	Added additional missing unordered_map include that fixes build on FreeBSD (#1755 )	2025-01-07 08:27:55 -08:00
Angelos Katharopoulos	545f84d905	Refactor distributed backend (#1752 )	2025-01-06 17:33:15 -08:00
Awni Hannun	d5ec172c95	Allow boolean mask in sdpa (#1753 ) * allow boolean mask in sdpa * more permissive donation in ternary	2025-01-06 16:57:07 -08:00
Angelos Katharopoulos	25b3a3e541	Optionally specify names for arrays when exporting (#1749 )	2025-01-06 13:07:46 -08:00
Awni Hannun	058d6ce683	mpi send use input as output (#1750 ) * mpi send use input as output * move earlier	2025-01-06 06:08:43 -08:00
Angelos Katharopoulos	eab93985b8	Update custom function docs (#1748 )	2025-01-03 16:35:25 -08:00
Awni Hannun	b51d70a83c	export docs (#1747 )	2025-01-03 15:04:17 -08:00
Awni Hannun	259025100e	Fix nd ternary on GPU (#1746 )	2025-01-03 11:52:17 -08:00
Awni Hannun	c9d30aa6ac	MLX in C++ example (#1736 ) * MLX in C++ example * nits * fix docs	2025-01-02 19:09:04 -08:00
Angelos Katharopoulos	8544b42007	Add namespace (#1745 )	2025-01-02 16:49:23 -08:00
Awni Hannun	6fa0501387	Fix concatenate/slice_update vjp + reduce binary size (#1735 ) * fix concatenate vjp + reduce binary size * also cast in slice update	2025-01-02 16:36:33 -08:00
Awni Hannun	ae69cb15e9	shapeless compile in docs and partially shapeless reshape (#1742 )	2025-01-02 16:24:42 -08:00
Awni Hannun	a64a8dfe45	fix extension (#1740 )	2025-01-02 16:16:16 -08:00
Venkata Naga Aditya Datta Chivukula	491fa95b1f	Added Kronecker Product (#1728 )	2025-01-02 16:00:34 -08:00
Danilo Peixoto	92ec632ad5	Fix Distributed Communication documentation (#1731 ) * Add missing `size()` method call for group	2025-01-02 14:08:38 -08:00
Cheng	8ecdfb718b	Fix export.cpp compilation with MSVC (#1737 )	2024-12-29 06:56:30 -08:00
Awni Hannun	4ba0c24a8f	Export / import functions to / from a file (#1642 ) * export and import functions * refactor + works for few primitives * nit * allow primitives with state * nit * nit * simplify serialize / deserialize * fix for constants * python bindings * maybe fix serialize failure case * add example * more primitives, training kind of works * same result for python and c++ * some fixes * fix export * template it up * some simplificatoin * rebase * allow kwargs and multiple functions * exporter * more primitives for exporting * deal with endianness * handle invalid stream * add docstring	2024-12-24 11:19:13 -08:00
Cheng	935c8c4bb1	Make mx.compile work on Windows (#1697 ) * Invoke MSVC on Windows in mx.compile * Export kernel symbol on MSVC * Remove unused template * Parse env pairs in a robust way * No need of cassert * Remove unnecessary helpers * Fix right trim * Move command building to a separate file * Missing header * Do not pollute cwd with cl.exe * Simplify str concat * Pass output dir * Fix styling	2024-12-24 07:02:33 -08:00
Valentin Roussellet	88f993da38	Explicit parentheses around some logical operators (#1732 ) * fix some warnings * format	2024-12-24 07:02:20 -08:00
Awni Hannun	ebfe64b92d	shapeless slice update and broadcast when possible (#1727 )	2024-12-23 11:25:15 -08:00
Awni Hannun	0308e9af71	Allow offset to be an mx.array for `mx.fast.rope` (#1724 ) * allow offset for rope * comment	2024-12-19 15:51:44 -08:00
Awni Hannun	c3628eea49	Add `mx.finfo` and use it when making causal mask (#1726 ) * finfo * fixes * docs	2024-12-19 14:52:41 -08:00
Awni Hannun	e03f0372b1	More shape type (#1705 ) * more shape type * fix	2024-12-19 08:08:20 -08:00
Alex Barron	f17536af9c	More lenient mask type check in SDPA (#1723 ) * check mask type * require promotion	2024-12-18 19:41:38 -08:00
Cheng	ed4ec81bca	Link python extension with mlx statically on Windows (#1716 ) * Link python extension with mlx statically on Windows * More readable code	2024-12-18 19:26:04 -08:00
Awni Hannun	7480059306	track resource limit and throw if exceeded (#1718 )	2024-12-18 18:45:58 -08:00
Awni Hannun	8bae22b0fa	fix deletion of non-evaled arrays with siblings (#1714 )	2024-12-18 18:45:36 -08:00
Alex Barron	49c34c4161	check mask type (#1721 )	2024-12-18 14:25:18 -08:00
Awni Hannun	5548fcc96d	fix synch race (#1719 )	2024-12-18 12:25:16 -08:00
Cheng	070bd433ab	Shorter kernel name for Windows (#1701 ) * Shorter kernel name for Windows * Only hash the clipped part	2024-12-17 18:51:38 -08:00
Cheng	c8fb54951a	Define NOMINMAX before windows.h (#1715 )	2024-12-17 18:51:24 -08:00
Awni Hannun	f110357aaa	Bump nanobind to 2.4 + fix (#1710 ) * bump nanobind to 2.4 + fix * fix	2024-12-17 10:57:54 -08:00
Tomohiro Oga	a6b426422e	add cubic to type hinting for upsample (#1709 )	2024-12-17 07:30:23 -08:00
Awni Hannun	d03c01dfbc	fix unflatten vjp (#1708 )	2024-12-16 18:37:57 -08:00
Jesper Stemann Andersen	a82996e9fb	io/load: Enabled pread implementation for mingw32 (#1706 )	2024-12-16 07:20:45 -08:00
Cheng	af5a614aad	Eval before cleanup so model file is unlocked (#1702 )	2024-12-14 21:41:49 -08:00
Cheng	f9640e049d	Install mlx.dll into the same dir with python bindings on Windows (#1690 ) * Install mlx.dll into the same dir with python bindings on Windows * Set BUILD_SHARED_LIBS for dlfcn-win32 * Update cmake requirements to 3.25 * Fix cmake style	2024-12-13 19:50:39 -08:00
Cheng	4768c61b57	Make sure gguf_ctx is closed when error happens (#1699 )	2024-12-13 19:50:19 -08:00
Cheng	dfccd17ab9	Use psutil to get memory info on Windows (#1700 )	2024-12-13 19:50:13 -08:00
Cheng	635117c5d4	Read/write files in binary mode (#1698 )	2024-12-13 17:37:05 -08:00
Awni Hannun	50f3535693	Use expand_dims / unflatten / etc in more places (#1696 ) * use expand_dims / unflatten in a couple more places * few more * few more * fix	2024-12-12 17:00:44 -08:00
Awni Hannun	9111999af3	Fix small sort with metal validation (#1695 )	2024-12-12 09:21:45 -08:00
Awni Hannun	6bd28d246e	Allow no copy negative strides in as_strided and slice (#1688 ) * allow no copy negative strides in as_strided and slice * fix jit * fix jit	2024-12-12 08:59:45 -08:00
Cheng	4d595a2a39	Make compiled preamble work in MSVC (#1675 ) * Make compiled preamble work in MSVC * Remove logging * Only use powershell for MSVC	2024-12-12 08:55:49 -08:00
Awni Hannun	3a21f61772	Fix build (#1693 )	2024-12-11 23:56:25 -08:00
Awni Hannun	4e1e9520e1	Flatten and unflatten (#1692 ) * flatten and unflatten * fix grad * fix shape infer * use squeeze + unsqueeze in get_item	2024-12-11 21:51:37 -08:00
Cheng	0bf19037ca	Remove "using namespace mlx::core" in python/src (#1689 )	2024-12-11 15:45:39 -08:00
Awni Hannun	f3dfa36a3a	Fix x86 tests (#1691 ) * fix x86 tests * comment	2024-12-11 07:47:18 -08:00
Cheng	4f9b60dd53	Remove "using namespace mlx::core" in benchmarks/examples (#1685 ) * Remove "using namespace mlx::core" in benchmarks/examples * Fix building example extension * A missing one in comment * Fix building on M chips	2024-12-11 07:08:29 -08:00
Awni Hannun	f76a49e555	`ExpandDims` primitive (#1687 ) * add squeeze primitive * simplify squeeze, use in gather * fix * fix * fix * fix * fix no cpu * use squeeze in matmul and friends * expand dims primitive * comment	2024-12-10 16:39:07 -08:00
Cheng	310ad8d9db	Build OpenBLAS from source code for MSVC (#1674 ) * Download OpenBLAS binaries when building with MSVC * Download dlfcn-win32 * Link with dlfcn-win32 correctly * Build OpenBLAS from source code * Link with openblas statically * Link with BLAS privately	2024-12-10 16:14:44 -08:00
Cheng	56db268f47	Provide a pread implementation for MSVC (#1666 )	2024-12-10 15:55:53 -08:00
Cheng	92ab6bdeb8	Fix shared library not exporting symbols on Windows (#1684 ) * Fix shared library not exporting symbols on Windows * Function name style	2024-12-10 13:59:14 -08:00
Cheng	0070e360a1	Disable MSVC warnings (#1680 )	2024-12-09 19:41:14 -08:00
Amethyst Shen	9df8fed046	Metal-cpp version bump (#1668 ) * Metal-cpp version bump Apple has released the stable version of Metal-cpp for macOS 15 and iOS 18. CMakeLists.txt is updated to build with it instead of the beta one. * Fix style with cmake-format	2024-12-09 19:40:35 -08:00
Cheng	a59fae040f	Fix library output directory for MSVC (#1681 )	2024-12-09 19:07:50 -08:00
Awni Hannun	29a620cab2	No reshapes in quantized embedding (#1682 ) * no reshapes in quantized embedding * fix inadvertant cast * add tol	2024-12-09 18:57:38 -08:00
Cheng	87d7a2520e	Use Py_ssize_t in python bindings (#1678 ) * Use Py_ssize_t in python bindings * Args passed to std::max must be same type	2024-12-09 12:59:19 -08:00
Awni Hannun	40c62c1321	Use int64 stride everywhere (#1671 ) * use int64 stride everywhere * fix ext * fix ext * more shape + cleanup * one more * few more	2024-12-09 11:09:02 -08:00
Awni Hannun	35b412c099	Fix compile hasher for string constants. (#1677 ) * fix hash * add test * nit	2024-12-09 09:26:18 -08:00
Cheng	d0f471cff7	Using math defines requires switch in MSVC (#1665 ) * Using math defines requires switch in MSVC * Fix more math macros * Fix type * Remove _MSC_VER guard for math defines	2024-12-08 08:16:28 -08:00
Cheng	6f316b8bf5	Use int64_t instead of ssize_t (#1673 )	2024-12-07 20:10:44 -08:00
Cheng	7c10c93a1f	Convert filesystem path to std::string explicitly (#1672 )	2024-12-07 20:10:06 -08:00
Cheng	d92ea094f1	Use && instead of and (#1663 ) * Use && instead of and * Remove "and" in ops.cpp	2024-12-07 18:26:39 -08:00
Cheng	6ae5423b4a	Do not pass integers to isnan (#1664 )	2024-12-07 18:26:23 -08:00
Cheng	9635cffdc8	Include io.h in MSVC for IO functions (#1661 )	2024-12-07 18:26:06 -08:00
Cheng	96986fb362	Use auto* for pointers (#1662 )	2024-12-07 18:25:40 -08:00
Cheng	3ceb341a75	Use correct complex type for MSVC (#1660 )	2024-12-07 18:25:22 -08:00