mirror of https://github.com/ml-explore/mlx.git
synced 2025-06-24 01:17:26 +08:00

jagrit's commit files

This commit is contained in:
parent d1f86272a2
commit e6306cfee9
63  README.md
@@ -1,2 +1,61 @@
# mlx
MLX: An array framework for Apple silicon
# MLX

MLX is an array framework for machine learning specifically targeting Apple
Silicon. MLX is designed with inspiration from Jax, PyTorch, and ArrayFire.

[Documentation](https://at.apple.com/mlx)

## Build

```
mkdir -p build && cd build
cmake .. && make -j
```

Run the C++ tests with `make test` (or `./tests/tests` for more detailed output).

### Python bindings

To install, run:

```
env CMAKE_BUILD_PARALLEL_LEVEL="" pip install .
```

For development, use an editable install:

```
env CMAKE_BUILD_PARALLEL_LEVEL="" pip install -e .
```

To make sure the install is working, run the tests with:

```
python -m unittest discover python/tests
```

## Develop

- Fork and submit pull requests to the repo.

- Every PR should have passing tests and at least one review.

- If a change is likely to impact efficiency, run some of the benchmarks before
  and after the change. Examples of benchmarks can be found in `benchmarks/cpp/`.

- Install `pre-commit` using something like `pip install pre-commit` and run
  `pre-commit install`. This should install hooks for running `black` and
  `clang-format` to ensure consistent style for C++ and Python code.

You can also run the formatters manually as follows:

```
clang-format -i file.cpp
```

```
black file.py
```

or run `pre-commit run --all-files` to check all files in the repo.
11  benchmarks/cpp/CMakeLists.txt  Normal file
@@ -0,0 +1,11 @@
function(build_benchmark SRCFILE)
  get_filename_component(src_name ${SRCFILE} NAME_WE)
  set(target "${src_name}")
  add_executable(${target} ${SRCFILE})
  target_link_libraries(${target} PRIVATE mlx)
endfunction(build_benchmark)

build_benchmark(single_ops.cpp)
build_benchmark(irregular_strides.cpp)
build_benchmark(compare_devices.cpp)
build_benchmark(autograd.cpp)
37  benchmarks/cpp/autograd.cpp  Normal file
@@ -0,0 +1,37 @@
#include <iostream>

#include "mlx/mlx.h"
#include "time_utils.h"

using namespace mlx::core;

void time_value_and_grad() {
  auto x = ones({200, 1000});
  eval(x);
  auto fn = [](array x) {
    for (int i = 0; i < 20; ++i) {
      x = log(exp(x));
    }
    return sum(x);
  };

  auto grad_fn = grad(fn);
  auto independent_value_and_grad = [&]() {
    auto value = fn(x);
    auto dfdx = grad_fn(x);
    return std::vector<array>{value, dfdx};
  };
  TIME(independent_value_and_grad);

  auto value_and_grad_fn = value_and_grad(fn);
  auto combined_value_and_grad = [&]() {
    auto [value, dfdx] = value_and_grad_fn(x);
    return std::vector<array>{value, dfdx};
  };
  TIME(combined_value_and_grad);
}

int main() {
  std::cout << "Benchmarks for " << default_device() << std::endl;
  time_value_and_grad();
}
25  benchmarks/cpp/compare_devices.cpp  Normal file
@@ -0,0 +1,25 @@
#include <iostream>
#include "mlx/mlx.h"
#include "time_utils.h"

using namespace mlx::core;

void time_add_op() {
  std::vector<int> sizes(1, 1);
  for (int i = 0; i < 9; ++i) {
    sizes.push_back(10 * sizes.back());
  }
  set_default_device(Device::cpu);
  for (auto size : sizes) {
    auto a = random::uniform({size});
    auto b = random::uniform({size});
    eval(a, b);
    std::cout << "Size " << size << std::endl;
    TIMEM("cpu", add, a, b, Device::cpu);
    TIMEM("gpu", add, a, b, Device::gpu);
  }
}

int main() {
  time_add_op();
}
38  benchmarks/numpy/single_ops.py  Normal file
@@ -0,0 +1,38 @@
import numpy as np

from time_utils import time_fn


def time_add():
    a = np.ones((100, 100, 10), dtype=np.float32)
    b = np.ones((100, 100, 10), dtype=np.float32)
    time_fn(np.add, a, b)


def time_matmul():
    a = np.random.rand(1000, 500).astype(np.float32)
    b = np.random.rand(500, 1000).astype(np.float32)
    time_fn(np.matmul, a, b)


def time_exp():
    a = np.random.randn(1000, 100).astype(np.float32)
    time_fn(np.exp, a)


def time_take():
    a = np.random.rand(10000, 500)
    ids = np.random.randint(0, 10000, (20, 10))
    ids = [idx.reshape(-1) for idx in np.split(ids, 20)]

    def random_take():
        return [np.take(a, idx, 0) for idx in ids]

    time_fn(random_take)


if __name__ == "__main__":
    time_add()
    time_matmul()
    time_exp()
    time_take()
18  benchmarks/numpy/time_utils.py  Normal file
@@ -0,0 +1,18 @@
import time


def time_fn(fn, *args):
    print(f"Timing {fn.__name__} ...", end=" ")

    # warmup
    for _ in range(5):
        fn(*args)

    num_iters = 100
    tic = time.perf_counter()
    for _ in range(num_iters):
        x = fn(*args)
    toc = time.perf_counter()

    msec = 1e3 * (toc - tic) / num_iters
    print(f"{msec:.5f} msec")
190  benchmarks/python/blas/bench_gemm.py  Normal file
@@ -0,0 +1,190 @@
import numpy as np
import argparse
import mlx.core as mx
import time
import torch
import os
import math
import subprocess

device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
device_name = device_name.decode("utf-8").strip("\n")

N_warmup = 8
N_iter_bench = 80
N_iter_func = 5


def bench(f, a, b):
    for i in range(N_warmup):
        f(a, b)
    torch.mps.synchronize()

    s = time.perf_counter_ns()
    for i in range(N_iter_bench):
        f(a, b)
    e = time.perf_counter_ns()
    return (e - s) * 1e-9


def gemm_nn_mlx(a, b):
    ys = []
    for i in range(N_iter_func):
        y = a @ b
        ys.append(y)
    mx.eval(ys)
    return ys


def gemm_nt_mlx(a, b):
    ys = []
    for i in range(N_iter_func):
        y = a @ b.transpose((0, 2, 1))
        ys.append(y)
    mx.eval(ys)
    return ys


def gemm_tn_mlx(a, b):
    ys = []
    for i in range(N_iter_func):
        y = a.transpose((0, 2, 1)) @ b
        ys.append(y)
    mx.eval(ys)
    return ys


def gemm_tt_mlx(a, b):
    ys = []
    for i in range(N_iter_func):
        y = a.transpose((0, 2, 1)) @ b.transpose((0, 2, 1))
        ys.append(y)
    mx.eval(ys)
    return ys


@torch.no_grad()
def gemm_nn_torch(a, b):
    ys = []
    for i in range(N_iter_func):
        y = a @ b
        ys.append(y)
    torch.mps.synchronize()
    return ys


@torch.no_grad()
def gemm_nt_torch(a, b):
    ys = []
    for i in range(N_iter_func):
        y = a @ b.transpose(-1, -2)
        ys.append(y)
    torch.mps.synchronize()
    return ys


@torch.no_grad()
def gemm_tn_torch(a, b):
    ys = []
    for i in range(N_iter_func):
        y = a.transpose(-1, -2) @ b
        ys.append(y)
    torch.mps.synchronize()
    return ys


@torch.no_grad()
def gemm_tt_torch(a, b):
    ys = []
    for i in range(N_iter_func):
        y = a.transpose(-1, -2) @ b.transpose(-1, -2)
        ys.append(y)
    torch.mps.synchronize()
    return ys


def bench_shape(B, M, N, K, np_dtype, transpose="nn"):
    shape_a = (B, M, K) if transpose[0] == "n" else (B, K, M)
    shape_b = (B, K, N) if transpose[1] == "n" else (B, N, K)

    a_np = np.random.normal(0.0, 1.0 / math.sqrt(M + K), shape_a).astype(np_dtype)
    b_np = np.random.normal(0.0, 1.0 / math.sqrt(N + K), shape_b).astype(np_dtype)

    a_mx = mx.array(a_np)
    b_mx = mx.array(b_np)

    a_pt = torch.from_numpy(a_np).to("mps")
    b_pt = torch.from_numpy(b_np).to("mps")

    torch.mps.synchronize()

    f_mx = {
        "nn": gemm_nn_mlx,
        "nt": gemm_nt_mlx,
        "tn": gemm_tn_mlx,
        "tt": gemm_tt_mlx,
    }[transpose]

    f_pt = {
        "nn": gemm_nn_torch,
        "nt": gemm_nt_torch,
        "tn": gemm_tn_torch,
        "tt": gemm_tt_torch,
    }[transpose]

    time_torch = bench(f_pt, a_pt, b_pt)
    time_mlx = bench(f_mx, a_mx, b_mx)

    t_a = (0, 1, 2) if transpose[0] == "n" else (0, 2, 1)
    t_b = (0, 1, 2) if transpose[1] == "n" else (0, 2, 1)

    c_mlx = a_mx.transpose(t_a) @ b_mx.transpose(t_b)
    c_npy = a_np.transpose(t_a).astype(np.float32) @ b_np.transpose(t_b).astype(
        np.float32
    )

    atol = 1e-5 if np_dtype == np.float32 else 1e-4

    if not np.allclose(c_mlx, c_npy.astype(np_dtype), atol=atol):
        print(
            f"Failed at {(B, M, N, K)} [transpose = {transpose}] with max(|a - b|) = {np.max(np.abs(c_npy - c_mlx))}"
        )

    return time_mlx, time_torch


def get_gflop_count(B, M, N, K):
    return float(2.0 * N_iter_bench * N_iter_func * B * M * N * K) / float(1024.0**3)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run gemm benchmarks")

    dtypes = ("float32", "float16")
    transposes = ("nn", "nt", "tn")
    shapes = (
        (16, 1024, 1024, 1024),
        (1, 1024, 1024, 2048),
        (4, 1024, 1024, 4096),
        (4, 1024, 4096, 1024),
        (1, 4096, 4096, 4096),
        (15, 1023, 1023, 1023),
        (17, 1025, 1025, 1025),
    )

    for dtype in dtypes:
        for transpose in transposes:
            for B, M, N, K in shapes:
                np_dtype = getattr(np, dtype)
                time_mlx, time_torch = bench_shape(B, M, N, K, np_dtype, transpose)

                gflop_count = get_gflop_count(B, M, N, K)
                gflops_mx = gflop_count / (time_mlx)
                gflops_pt = gflop_count / (time_torch)
                diff = gflops_mx / gflops_pt - 1.0

                print(
                    f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100. * diff:+5.2f}%"
                )
                if gflops_pt >= 2.0 * gflops_mx:
                    print("ATTENTION ^^^^^^^")
219  benchmarks/python/blas/bench_gemv.py  Normal file
@@ -0,0 +1,219 @@
import matplotlib.pyplot as plt
import numpy as np
import argparse
import mlx.core as mx
import time
import torch
import os
import subprocess


results_dir = "./results"

if not os.path.isdir(results_dir):
    os.mkdir(results_dir)

device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
device_name = device_name.decode("utf-8").strip("\n")

N_warmup = 5
N_iter_bench = 50
N_iter_func = 20

out_vec_sizes = [128, 512, 2048, 4096]
in_vec_sizes = [128, 512, 2048, 4096]

benchmark_vector_lens = []
benchmark_vector_lens += [(i + 1) * 4096 for i in range(8)][::2]
benchmark_vector_lens += [(i + 1) * 4095 for i in range(8)][::2]
benchmark_vector_lens += [(i + 1) * 4097 for i in range(8)][::2]
benchmark_vector_lens += [64, 128, 512, 1024, 2048, 11008, 32000]

benchmark_vector_lens.sort()


def bench(f, m, v):
    for i in range(N_warmup):
        f(m, v)
    torch.mps.synchronize()

    s = time.perf_counter_ns()
    for i in range(N_iter_bench):
        f(m, v)
    e = time.perf_counter_ns()
    return (e - s) * 1e-9


def gemv_mlx(m, v):
    ys = []
    for i in range(N_iter_func):
        y = m @ v
        ys.append(y)
    mx.eval(ys)
    return ys


def gemv_t_mlx(m, v):
    ys = []
    for i in range(N_iter_func):
        y = v @ m
        ys.append(y)
    mx.eval(ys)
    return ys


@torch.no_grad()
def gemv_torch(m, v):
    ys = []
    for i in range(N_iter_func):
        y = m @ v
        ys.append(y)
    torch.mps.synchronize()
    return ys


@torch.no_grad()
def gemv_t_torch(m, v):
    ys = []
    for i in range(N_iter_func):
        y = v @ m
        ys.append(y)
    torch.mps.synchronize()
    return ys


def bench_lens(in_vec_len, out_vec_len, np_dtype, transpose=False):
    shape_mat = (in_vec_len, out_vec_len) if transpose else (out_vec_len, in_vec_len)
    shape_vec = (1, in_vec_len) if transpose else (in_vec_len, 1)

    mat_npy = np.random.normal(0.0, 2.0 / in_vec_len, shape_mat).astype(np_dtype)
    vec_npy = np.random.normal(0.0, 2.0 / in_vec_len, shape_vec).astype(np_dtype)
    mat_mlx = mx.array(mat_npy)
    vec_mlx = mx.array(vec_npy)
    mat_trc = torch.from_numpy(mat_npy).to("mps")
    vec_trc = torch.from_numpy(vec_npy).to("mps")

    torch.mps.synchronize()

    time_torch = (
        bench(gemv_t_torch, mat_trc, vec_trc)
        if transpose
        else bench(gemv_torch, mat_trc, vec_trc)
    )
    time_mlx = (
        bench(gemv_t_mlx, mat_mlx, vec_mlx)
        if transpose
        else bench(gemv_mlx, mat_mlx, vec_mlx)
    )

    c_mlx = (
        np.asarray(vec_mlx @ mat_mlx) if transpose else np.asarray(mat_mlx @ vec_mlx)
    )
    c_npy = (vec_npy @ mat_npy) if transpose else (mat_npy @ vec_npy)

    if not np.allclose(c_mlx, c_npy, atol=2e-5):
        print(
            f"Failed at {shape_mat} [transpose = {transpose}] with max(|a - b|) = {np.max(np.abs(c_npy - c_mlx))}"
        )

    return time_mlx, time_torch


def get_gflop_count(in_vec_len, out_vec_len):
    return float(2.0 * N_iter_bench * N_iter_func * in_vec_len * out_vec_len) / float(
        1024**3
    )


def get_gbyte_size(in_vec_len, out_vec_len, np_dtype):
    n_elem = in_vec_len * out_vec_len + in_vec_len + out_vec_len
    item_size = 4 if np_dtype == np.float32 else 2
    return float(N_iter_bench * N_iter_func * n_elem * item_size) / float(1024**3)


def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, transpose):
    np_dtype = getattr(np, dtype)
    mlx_gb_s = []
    mlx_gflops = []
    pyt_gb_s = []
    pyt_gflops = []

    for out_vec_len in out_vector_lens:
        gflop_count = get_gflop_count(in_vec_len, out_vec_len)
        gbyte_size = get_gbyte_size(in_vec_len, out_vec_len, np_dtype)

        time_mlx, time_torch = bench_lens(in_vec_len, out_vec_len, np_dtype, transpose)

        mlx_gb_s.append(gbyte_size / time_mlx)
        pyt_gb_s.append(gbyte_size / time_torch)

        mlx_gflops.append(gflop_count / time_mlx)
        pyt_gflops.append(gflop_count / time_torch)

    if transpose:
        title = f"gemv_t ([1, {in_vec_len}] [{in_vec_len}, out_vec_len]) | {dtype}"
    else:
        title = f"gemv ([out_vec_len, {in_vec_len}] X [{in_vec_len}, 1] ) | {dtype}"

    ax.plot(out_vector_lens, mlx_gb_s, "tab:blue", label="MLX")
    ax.plot(out_vector_lens, pyt_gb_s, "tab:red", label="Torch")
    ax.set_title(title)
    ax.set(xlabel="out_vector_len", ylabel="Performance (GB/s)")
    ax.legend()


def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose):
    np_dtype = getattr(np, dtype)
    mlx_gb_s = []
    mlx_gflops = []
    pyt_gb_s = []
    pyt_gflops = []

    for in_vec_len in in_vector_lens:
        gflop_count = get_gflop_count(in_vec_len, out_vec_len)
        gbyte_size = get_gbyte_size(in_vec_len, out_vec_len, np_dtype)

        time_mlx, time_torch = bench_lens(in_vec_len, out_vec_len, np_dtype, transpose)

        mlx_gb_s.append(gbyte_size / time_mlx)
        pyt_gb_s.append(gbyte_size / time_torch)

        mlx_gflops.append(gflop_count / time_mlx)
        pyt_gflops.append(gflop_count / time_torch)

    if transpose:
        title = f"([1, in_vec_len] [in_vec_len, {out_vec_len}])"
    else:
        title = f"([{out_vec_len}, in_vec_len] X [in_vec_len, 1] )"

    ax.plot(in_vector_lens, mlx_gb_s, "tab:blue", label="MLX")
    ax.plot(in_vector_lens, pyt_gb_s, "tab:red", label="Torch")
    ax.set_title(title)
    ax.set(xlabel="in_vector_len", ylabel="Performance (GB/s)")
    ax.legend()


for transpose in (False, True):
    for dtype in ("float32", "float16"):
        fig, axs = plt.subplots(
            len(in_vec_sizes), 2, figsize=(8.5, 11), layout="constrained"
        )

        for i, in_vec_len in enumerate(in_vec_sizes):
            bench_with_in_len(
                axs[i][0], in_vec_len, benchmark_vector_lens, dtype, transpose
            )

        for i, out_vec_len in enumerate(out_vec_sizes):
            bench_with_out_len(
                axs[i][1], out_vec_len, benchmark_vector_lens, dtype, transpose
            )

        op_name = "gemv_t" if transpose else "gemv"
        fig.suptitle(f"{device_name}: {dtype} {op_name}")
        fig.savefig(
            os.path.join(
                results_dir, f'{device_name.replace(" ", "_")}_{dtype}_{op_name}.pdf'
            )
        )
        plt.close(fig)
116  benchmarks/python/llama_mlx_bench.py  Normal file
@@ -0,0 +1,116 @@
import math
import time

import mlx.core as mx
import mlx.nn as nn
import mlx.utils


class LlamaAttention(nn.Module):
    def __init__(self, dims: int, num_heads: int):
        super().__init__()
        self.num_heads = num_heads
        self.rope = nn.RoPE(dims // num_heads, True)
        self.query_proj = nn.Linear(dims, dims, False)
        self.key_proj = nn.Linear(dims, dims, False)
        self.value_proj = nn.Linear(dims, dims, False)
        self.out_proj = nn.Linear(dims, dims, False)

    def __call__(self, queries, keys, values, mask=None, cache=None):
        queries = self.query_proj(queries)
        keys = self.key_proj(keys)
        values = self.value_proj(values)

        num_heads = self.num_heads
        B, L, D = queries.shape
        queries = mx.transpose(mx.reshape(queries, (B, L, num_heads, -1)), (0, 2, 1, 3))
        keys = mx.transpose(mx.reshape(keys, (B, L, num_heads, -1)), (0, 2, 1, 3))
        values = mx.transpose(mx.reshape(values, (B, L, num_heads, -1)), (0, 2, 1, 3))

        if cache is not None:
            key_cache, value_cache = cache
            queries = self.rope(queries, offset=key_cache.shape[2])
            keys = self.rope(keys, offset=key_cache.shape[2])
            keys = mx.concatenate([key_cache, keys], axis=2)
            values = mx.concatenate([value_cache, values], axis=2)
        else:
            queries = self.rope(queries)
            keys = self.rope(keys)

        # Dimensions are [batch x num heads x sequence x hidden dim]
        scale = mx.array(math.sqrt(1 / queries.shape[-1]), dtype=queries.dtype)
        scores = (queries * scale) @ mx.transpose(keys, (0, 1, 3, 2))
        if mask is not None:
            scores = scores + mask
        scores = mx.softmax(scores, axis=-1)
        values_hat = mx.reshape(mx.transpose(scores @ values, (0, 2, 1, 3)), (B, L, -1))

        return self.out_proj(values_hat), (keys, values)


class LlamaEncoderLayer(nn.Module):
    def __init__(self, dims: int, mlp_dims: int, num_heads: int):
        super().__init__()

        self.attention = LlamaAttention(dims, num_heads)

        self.norm1 = nn.RMSNorm(dims)
        self.norm2 = nn.RMSNorm(dims)

        self.linear1 = nn.Linear(dims, mlp_dims, False)
        self.linear2 = nn.Linear(dims, mlp_dims, False)
        self.linear3 = nn.Linear(mlp_dims, dims, False)

    def __call__(self, x, mask=None, cache=None):
        y = self.norm1(x)
        y, cache = self.attention(y, y, y, mask, cache)
        x = x + y

        y = self.norm2(x)
        a = self.linear1(y)
        b = self.linear2(y)
        y = a * mx.sigmoid(a) * b
        y = self.linear3(y)
        x = x + y

        return x, cache


def measure(model, x, cache):
    for i in range(5):
        y, c = model(x, mask=None, cache=cache)
        mx.eval(y, c)

    start = time.time()
    rs = []
    for i in range(5):
        y, c = model(x, mask=None, cache=cache)
        rs.append((y, c))
    mx.eval(rs)
    end = time.time()

    return (end - start) * 1000 / 5


if __name__ == "__main__":
    H = 32
    D = 4096
    F = 43 * 256
    C = 1000
    mx.set_default_device(mx.gpu)
    dtype = mx.float16

    layer = LlamaEncoderLayer(D, F, H)
    layer.update(mlx.utils.tree_map(lambda x: x.astype(dtype), layer.parameters()))
    k1, k2, k3 = mx.random.split(mx.random.key(0), 3)
    x = mx.random.normal([1, 1, D], dtype=dtype)
    cache = [
        mx.random.normal([1, H, C, D // H], dtype=dtype),
        mx.random.normal([1, H, C, D // H], dtype=dtype),
    ]
    mx.eval(x, cache)

    T = measure(layer, x, cache)

    print("Time per layer per token:", T, "ms")
    print("Lower bound total time per token:", T * 32, "ms")
56  cmake/extension.cmake  Normal file
@@ -0,0 +1,56 @@
include(CMakeParseArguments)

###############################################################################
# Build metal library
#
# Adds a custom target ${TARGET} to build ${OUTPUT_DIRECTORY}/{TITLE}.metallib
# from list ${SOURCES}, including list ${INCLUDE_DIRS}, depends on list ${DEPS}
#
# Args:
#   TARGET: Custom target to be added for the metal library
#   TITLE: Name of the .metallib
#   OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib
#   SOURCES: List of source files
#   INCLUDE_DIRS: List of include dirs
#   DEPS: List of dependency files (like headers)
#
macro(mlx_build_metallib)
  # Parse args
  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY)
  set(multiValueArgs SOURCES INCLUDE_DIRS DEPS)
  cmake_parse_arguments(
    MTLLIB
    ""
    "${oneValueArgs}"
    "${multiValueArgs}"
    ${ARGN}
  )

  # Set output
  set(MTLLIB_BUILD_TARGET "${MTLLIB_OUTPUT_DIRECTORY}/${MTLLIB_TITLE}.metallib")

  # Collect compile options
  set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math)

  # Prepare metallib build command
  add_custom_command(
    OUTPUT ${MTLLIB_BUILD_TARGET}
    COMMAND xcrun -sdk macosx metal
            "$<LIST:TRANSFORM,${MTLLIB_INCLUDE_DIRS},PREPEND,-I>"
            ${MTLLIB_COMPILE_OPTIONS}
            ${MTLLIB_SOURCES}
            -o ${MTLLIB_BUILD_TARGET}
    DEPENDS ${MTLLIB_DEPS} ${MTLLIB_SOURCES}
    COMMAND_EXPAND_LISTS
    COMMENT "Building ${MTLLIB_TITLE}.metallib"
    VERBATIM
  )

  # Add metallib custom target
  add_custom_target(
    ${MTLLIB_TARGET}
    DEPENDS
    ${MTLLIB_BUILD_TARGET}
  )

endmacro(mlx_build_metallib)
0  docs/.nojekyll  Normal file

1  docs/index.html  Normal file
@@ -0,0 +1 @@
<meta http-equiv="refresh" content="0; url=./build/html/index.html" />
19  docs/src/_templates/nn-module-template.rst  Normal file
@@ -0,0 +1,19 @@
{{ fullname | escape | underline}}

.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}

{#{% block methods %}

   {% if methods %}
   .. rubric:: {{ _('Methods') }}

   .. autosummary::
      {% for item in methods %}
      {%- if item not in inherited_members and item != '__init__' %}
         ~{{ name }}.{{ item }}
      {%- endif %}
      {%- endfor %}
   {% endif %}
{% endblock %}#}
6  docs/src/cpp/ops.rst  Normal file
@@ -0,0 +1,6 @@
.. _cpp_ops:

Operations
==========
948  docs/src/dev/extensions.rst  Normal file
@@ -0,0 +1,948 @@
Developer Documentation
=======================

MLX provides an open and flexible backend to which users may add operations
and specialized implementations without much hassle. While the library supplies
efficient operations that can be used and composed for any number of
applications, there may arise cases where new functionalities or highly
optimized implementations are needed. For such cases, you may design and
implement your own operations that link to and build on top of :mod:`mlx.core`.
We will introduce the inner workings of MLX and go over a simple example to
learn the steps involved in adding new operations to MLX with your own CPU
and GPU implementations.

Introducing the Example
-----------------------

Let's say that you would like an operation that takes in two arrays,
``x`` and ``y``, scales them both by some coefficients ``alpha`` and ``beta``
respectively, and then adds them together to get the result
``z = alpha * x + beta * y``. Well, you can very easily do that by just
writing out a function as follows:

.. code-block:: python

    import mlx.core as mx

    def simple_axpby(x: mx.array, y: mx.array, alpha: float, beta: float) -> mx.array:
        return alpha * x + beta * y

This function performs that operation while leaving the implementations and
differentiation to MLX.
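Because ``simple_axpby`` is composed entirely of existing MLX operations,
transformations such as :meth:`grad` already work on it. Here is a minimal
sketch of that (using only the public :mod:`mlx.core` API; the :meth:`sum`
reduction is there just to produce a scalar to differentiate):

.. code-block:: python

    import mlx.core as mx

    def simple_axpby(x: mx.array, y: mx.array, alpha: float, beta: float) -> mx.array:
        return alpha * x + beta * y

    # mx.grad differentiates a scalar-valued function with respect to
    # its first argument, so reduce the output to a scalar first
    loss = lambda x, y: mx.sum(simple_axpby(x, y, 2.0, 0.5))

    x = mx.ones((4,))
    y = mx.ones((4,))
    print(mx.grad(loss)(x, y))  # d(loss)/dx = alpha = 2 everywhere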
However, you work with vector math libraries often and realize that the
``axpby`` routine defines the same operation ``Y = (alpha * X) + (beta * Y)``.
You would really like the part of your applications that does this operation
on the CPU to be very fast, so you decide that you want it to rely on the
``axpby`` routine provided by the Accelerate_ framework. Continuing to impose
our assumptions onto you, let's also assume that you want to learn how to add
your own implementation for the gradients of your new operation while going
over the ins and outs of the MLX framework.

Well, what a coincidence! You are in the right place. Over the course of this
example, we will learn:

* The structure of the MLX library from the frontend API to the backend implementations.
* How to implement your own CPU backend that redirects to Accelerate_ when appropriate (and a fallback if needed).
* How to add your own GPU implementation using metal.
* How to add your own ``vjp`` and ``jvp``.
* How to build your implementations, link them to MLX, and bind them to python.

Operations and Primitives
-------------------------

In one sentence, operations in MLX build the computation graph, and primitives
provide the rules for evaluation and transformations of said graph. Let's start
by discussing operations in more detail.

Operations
^^^^^^^^^^^

Operations are the frontend functions that operate on arrays. They are defined
in the C++ API (:ref:`cpp_ops`), and then we provide bindings to these
operations in the Python API (:ref:`ops`).

We would like an operation, :meth:`axpby`, that takes in two arrays ``x`` and ``y``,
and two scalars, ``alpha`` and ``beta``. This is how we would define it in the
C++ API:

.. code-block:: C++

    /**
    * Scale and sum two vectors elementwise
    * z = alpha * x + beta * y
    *
    * Follow numpy style broadcasting between x and y
    * Inputs are upcasted to floats if needed
    **/
    array axpby(
        const array& x, // Input array x
        const array& y, // Input array y
        const float alpha, // Scaling factor for x
        const float beta, // Scaling factor for y
        StreamOrDevice s = {} // Stream on which to schedule the operation
    );

This operation itself can call other operations within it if needed. So, the
simplest way to go about implementing this operation would be to do so in terms
of existing operations.

.. code-block:: C++

    array axpby(
        const array& x, // Input array x
        const array& y, // Input array y
        const float alpha, // Scaling factor for x
        const float beta, // Scaling factor for y
        StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
    ) {
      // Scale x and y on the provided stream
      auto ax = multiply(array(alpha), x, s);
      auto by = multiply(array(beta), y, s);

      // Add and return
      return add(ax, by, s);
    }

However, as we discussed earlier, this is not our goal. The operations themselves
do not contain the implementations that act on the data, nor do they contain the
rules of transformations. Rather, they are an easy-to-use interface that builds
on top of the building blocks we call :class:`Primitive`.

Primitives
^^^^^^^^^^^

A :class:`Primitive` is part of the computation graph of an :class:`array`. It
defines how to create an output given a set of input :class:`array` objects.
Further, a :class:`Primitive` is a class that contains rules on how it is
evaluated on the CPU or GPU, and how it acts under transformations such as
``vjp`` and ``jvp``. These words on their own can be a bit abstract, so let's
take a step back and go to our example to give ourselves a more concrete image.

.. code-block:: C++

    class Axpby : public Primitive {
     public:
      explicit Axpby(Stream stream, float alpha, float beta)
          : Primitive(stream), alpha_(alpha), beta_(beta){};

      /**
      * A primitive must know how to evaluate itself on the CPU/GPU
      * for the given inputs and populate the output array.
      *
      * To avoid unnecessary allocations, the evaluation function
      * is responsible for allocating space for the array.
      */
      void eval_cpu(const std::vector<array>& inputs, array& out) override;
      void eval_gpu(const std::vector<array>& inputs, array& out) override;

      /** The Jacobian-vector product. */
      array jvp(
          const std::vector<array>& primals,
          const std::vector<array>& tangents,
          const std::vector<int>& argnums) override;

      /** The vector-Jacobian product. */
      std::vector<array> vjp(
          const std::vector<array>& primals,
          const array& cotan,
          const std::vector<int>& argnums) override;

      /**
      * The primitive must know how to vectorize itself across
      * the given axes. The output is a pair containing the array
      * representing the vectorized computation and the axis which
      * corresponds to the output vectorized dimension.
      */
      std::pair<array, int> vmap(
          const std::vector<array>& inputs,
          const std::vector<int>& axes) override;

      /** Print the primitive. */
      void print(std::ostream& os) override {
        os << "Axpby";
      }

      /** Equivalence check **/
      bool is_equivalent(const Primitive& other) const override;

     private:
      float alpha_;
      float beta_;

      /** Fall back implementation for evaluation on CPU */
      void eval(const std::vector<array>& inputs, array& out);
    };

The :class:`Axpby` class derives from the base :class:`Primitive` class and
follows the above demonstrated interface. :class:`Axpby` treats ``alpha`` and
``beta`` as parameters. It then provides implementations of how the array ``out``
is produced given ``inputs`` through :meth:`Axpby::eval_cpu` and
:meth:`Axpby::eval_gpu`. Further, it provides rules of transformations in
:meth:`Axpby::jvp`, :meth:`Axpby::vjp`, and :meth:`Axpby::vmap`.

Using the Primitives
^^^^^^^^^^^^^^^^^^^^^

Operations can use this :class:`Primitive` to add a new :class:`array` to
the computation graph. An :class:`array` can be constructed by providing its
data type, shape, the :class:`Primitive` that computes it, and the
:class:`array` inputs that are passed to the primitive.

Let's re-implement our operation now in terms of our :class:`Axpby` primitive.

.. code-block:: C++

    array axpby(
        const array& x, // Input array x
        const array& y, // Input array y
        const float alpha, // Scaling factor for x
        const float beta, // Scaling factor for y
        StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
    ) {
      // Promote dtypes between x and y as needed
      auto promoted_dtype = promote_types(x.dtype(), y.dtype());

      // Upcast to float32 for non-floating point inputs x and y
      auto out_dtype = is_floating_point(promoted_dtype)
          ? promoted_dtype
          : promote_types(promoted_dtype, float32);

      // Cast x and y up to the determined dtype (on the same stream s)
      auto x_casted = astype(x, out_dtype, s);
      auto y_casted = astype(y, out_dtype, s);

      // Broadcast the shapes of x and y (on the same stream s)
      auto broadcasted_inputs = broadcast_arrays({x_casted, y_casted}, s);
      auto out_shape = broadcasted_inputs[0].shape();

      // Construct the array as the output of the Axpby primitive
      // with the broadcasted and upcasted arrays as inputs
      return array(
          /* const std::vector<int>& shape = */ out_shape,
          /* Dtype dtype = */ out_dtype,
          /* std::unique_ptr<Primitive> primitive = */
          std::make_unique<Axpby>(to_stream(s), alpha, beta),
          /* const std::vector<array>& inputs = */ broadcasted_inputs);
    }

This operation now handles the following:

#. Upcast inputs and resolve the output data type.
#. Broadcast the inputs and resolve the output shape.
#. Construct the primitive :class:`Axpby` using the given stream, ``alpha``, and ``beta``.
#. Construct the output :class:`array` using the primitive and the inputs.

Implementing the Primitive
--------------------------

No computation happens when we call the operation alone. In effect, the
operation only builds the computation graph. When we evaluate the output
array, MLX schedules the execution of the computation graph, and calls
:meth:`Axpby::eval_cpu` or :meth:`Axpby::eval_gpu` depending on the
stream/device specified by the user.
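This laziness is easy to observe from Python. A small sketch (again assuming
only the public :mod:`mlx.core` API):

.. code-block:: python

    import mlx.core as mx

    x = mx.ones((2, 2))
    y = mx.ones((2, 2))

    # This only records a node in the graph; nothing has been computed yet
    z = mx.add(x, y, stream=mx.cpu)

    # Evaluation schedules the graph and invokes the primitive's eval_cpu
    # (it would invoke eval_gpu had we scheduled the op on mx.gpu)
    mx.eval(z)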
.. warning::
    When :meth:`Primitive::eval_cpu` or :meth:`Primitive::eval_gpu` are called,
    no memory has been allocated for the output array. It falls on the
    implementation of these functions to allocate memory as needed.

Implementing the CPU Backend
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Let's start by trying to implement a naive and generic version of
:meth:`Axpby::eval_cpu`. We declared this as a private member function of
:class:`Axpby` earlier, called :meth:`Axpby::eval`.

Our naive method will go over each element of the output array, find the
corresponding input elements of ``x`` and ``y``, and perform the operation
pointwise. This is captured in the templated function :meth:`axpby_impl`.

.. code-block:: C++

    template <typename T>
    void axpby_impl(
        const array& x,
        const array& y,
        array& out,
        float alpha_,
        float beta_) {
      // We only allocate memory when we are ready to fill the output
      // malloc_or_wait synchronously allocates available memory
      // There may be a wait executed here if the allocation is requested
      // under memory-pressured conditions
      out.set_data(allocator::malloc_or_wait(out.nbytes()));

      // Collect input and output data pointers
      const T* x_ptr = x.data<T>();
      const T* y_ptr = y.data<T>();
      T* out_ptr = out.data<T>();

      // Cast alpha and beta to the relevant types
      T alpha = static_cast<T>(alpha_);
      T beta = static_cast<T>(beta_);

      // Do the elementwise operation for each output
      for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
        // Map linear indices to offsets in x and y
        auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
        auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());

        // We allocate the output to be contiguous and regularly strided
        // (defaults to row major) and hence it doesn't need additional mapping
        out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
      }
    }

Now, we would like our implementation to be able to do this pointwise operation
for all incoming floating point arrays. Accordingly, we add dispatches for
``float32``, ``float16``, ``bfloat16`` and ``complex64``. We throw an error
if we encounter an unexpected type.

.. code-block:: C++

    /** Fall back implementation for evaluation on CPU */
    void Axpby::eval(const std::vector<array>& inputs, array& out) {
      // Check the inputs (registered in the op while constructing the out array)
      assert(inputs.size() == 2);
      auto& x = inputs[0];
      auto& y = inputs[1];

      // Dispatch to the correct dtype
      if (out.dtype() == float32) {
        return axpby_impl<float>(x, y, out, alpha_, beta_);
      } else if (out.dtype() == float16) {
        return axpby_impl<float16_t>(x, y, out, alpha_, beta_);
      } else if (out.dtype() == bfloat16) {
        return axpby_impl<bfloat16_t>(x, y, out, alpha_, beta_);
      } else if (out.dtype() == complex64) {
        return axpby_impl<complex64_t>(x, y, out, alpha_, beta_);
      } else {
        throw std::runtime_error(
            "Axpby is only supported for floating point types.");
      }
    }

We have a fallback implementation! Now, to do what we are really here to do.
Remember we wanted to use the ``axpby`` routine provided by the Accelerate_
framework? Well, there are 3 complications to keep in mind:

#. Accelerate does not provide implementations of ``axpby`` for half precision
   floats. We can only direct to it for ``float32`` types.
#. Accelerate assumes the inputs ``x`` and ``y`` are contiguous and all elements
   have fixed strides between them. Possibly due to broadcasts and transposes,
   we aren't guaranteed that the inputs fit this requirement. We can
   only direct to Accelerate if both ``x`` and ``y`` are row contiguous or
   column contiguous.
#. Accelerate performs the routine ``Y = (alpha * X) + (beta * Y)`` in place.
   MLX expects to write out the answer to a new array. We must copy the elements
   of ``y`` into the output array and use that as an input to ``axpby``.

Let's write out an implementation that uses Accelerate in the right conditions.
It must simply allocate data for the output, copy elements of ``y`` into it,
and then call :meth:`catlas_saxpby` from Accelerate.

.. code-block:: C++

    template <typename T>
    void axpby_impl_accelerate(
        const array& x,
        const array& y,
        array& out,
        float alpha_,
        float beta_) {
      // Accelerate library provides catlas_saxpby which does
      // Y = (alpha * X) + (beta * Y) in place
      // To use it, we first copy the data in y over to the output array

      // This specialization requires both x and y be contiguous in the same mode
      // i.e: corresponding linear indices in both point to corresponding elements
      // The data in the output array is allocated to match the strides in y
      // such that x, y, and out are contiguous in the same mode and
      // no transposition is needed
      out.set_data(
          allocator::malloc_or_wait(y.data_size() * out.itemsize()),
          y.data_size(),
          y.strides(),
          y.flags());

      // We then copy over the elements using the contiguous vector specialization
      copy_inplace(y, out, CopyType::Vector);

      // Get x and y pointers for catlas_saxpby
      const T* x_ptr = x.data<T>();
      T* y_ptr = out.data<T>();

      T alpha = static_cast<T>(alpha_);
      T beta = static_cast<T>(beta_);

      // Call the inplace accelerate operator
      catlas_saxpby(
          /* N = */ out.size(),
          /* ALPHA = */ alpha,
          /* X = */ x_ptr,
          /* INCX = */ 1,
          /* BETA = */ beta,
          /* Y = */ y_ptr,
          /* INCY = */ 1);
    }

Great! But what about the inputs that do not fit the criteria for Accelerate?
Luckily, we can always just direct back to :meth:`Axpby::eval`.

With this in mind, let's finally implement our :meth:`Axpby::eval_cpu`.

.. code-block:: C++

    /** Evaluate primitive on CPU using accelerate specializations */
    void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
      assert(inputs.size() == 2);
      auto& x = inputs[0];
      auto& y = inputs[1];

      // Accelerate specialization for contiguous single precision float arrays
      if (out.dtype() == float32 &&
          ((x.flags().row_contiguous && y.flags().row_contiguous) ||
           (x.flags().col_contiguous && y.flags().col_contiguous))) {
        axpby_impl_accelerate<float>(x, y, out, alpha_, beta_);
        return;
      }

      // Fall back to common backend if specializations are not available
      eval(inputs, out);
    }

We have now hit a milestone! Just this much is enough to run the operation
:meth:`axpby` on a CPU stream!

If you do not plan on running the operation on the GPU or using transforms on
computation graphs that contain :class:`Axpby`, you can stop implementing the
primitive here and enjoy the speed-ups you get from the Accelerate library.
Implementing the GPU Backend
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Apple silicon devices address their GPUs using the Metal_ shading language, and
all GPU kernels in MLX are written using metal.

.. note::

    Here are some helpful resources if you are new to metal!

    * A walkthrough of the metal compute pipeline: `Metal Example`_
    * Documentation for metal shading language: `Metal Specification`_
    * Using metal from C++: `Metal-cpp`_

Let's keep the GPU algorithm simple. We will launch exactly as many threads
as there are elements in the output. Each thread will pick the element it needs
from ``x`` and ``y``, do the pointwise operation, and then update its assigned
element in the output.

.. code-block:: C++

    template <typename T>
    [[kernel]] void axpby_general(
        device const T* x [[buffer(0)]],
        device const T* y [[buffer(1)]],
        device T* out [[buffer(2)]],
        constant const float& alpha [[buffer(3)]],
        constant const float& beta [[buffer(4)]],
        constant const int* shape [[buffer(5)]],
        constant const size_t* x_strides [[buffer(6)]],
        constant const size_t* y_strides [[buffer(7)]],
        constant const int& ndim [[buffer(8)]],
        uint index [[thread_position_in_grid]]) {
      // Convert linear indices to offsets in array
      auto x_offset = elem_to_loc(index, shape, x_strides, ndim);
      auto y_offset = elem_to_loc(index, shape, y_strides, ndim);

      // Do the operation and update the output
      out[index] =
          static_cast<T>(alpha) * x[x_offset] + static_cast<T>(beta) * y[y_offset];
    }

We then need to instantiate this template for all floating point types and give
each instantiation a unique host name so we can identify the right kernel for
each data type.

.. code-block:: C++

    #define instantiate_axpby(type_name, type)               \
      template [[host_name("axpby_general_" #type_name)]]    \
      [[kernel]] void axpby_general<type>(                   \
          device const type* x [[buffer(0)]],                \
          device const type* y [[buffer(1)]],                \
          device type* out [[buffer(2)]],                    \
          constant const float& alpha [[buffer(3)]],         \
          constant const float& beta [[buffer(4)]],          \
          constant const int* shape [[buffer(5)]],           \
          constant const size_t* x_strides [[buffer(6)]],    \
          constant const size_t* y_strides [[buffer(7)]],    \
          constant const int& ndim [[buffer(8)]],            \
          uint index [[thread_position_in_grid]]);

    instantiate_axpby(float32, float);
    instantiate_axpby(float16, half);
    instantiate_axpby(bfloat16, bfloat16_t);
    instantiate_axpby(complex64, complex64_t);

This kernel will be compiled into a metal library ``mlx_ext.metallib`` as we
will see later in :ref:`Building with CMake`. In the following example, we
assume that the library ``mlx_ext.metallib`` will always be co-located with
the executable/shared-library calling the :meth:`register_library` function.
The :meth:`register_library` function takes the library's name and potential
path (or in this case, a function that can produce the path of the metal
library) and tries to load that library if it hasn't already been registered
by the relevant static :class:`mlx::core::metal::Device` object. This is why
it is important to package your C++ library with the metal library. We will
go over this process in more detail later.

The logic to determine the kernel, set the inputs, resolve the grid dimensions,
and dispatch it to the GPU is contained in :meth:`Axpby::eval_gpu` as shown
below.

.. code-block:: C++

    /** Evaluate primitive on GPU */
    void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
      // Prepare inputs
      assert(inputs.size() == 2);
      auto& x = inputs[0];
      auto& y = inputs[1];

      // Each primitive carries the stream it should execute on
      // and each stream carries its device identifiers
      auto& s = stream();
      // We get the needed metal device using the stream
      auto& d = metal::device(s.device);

      // Allocate output memory
      out.set_data(allocator::malloc_or_wait(out.nbytes()));

      // Resolve name of kernel (corresponds to axpby.metal)
      std::ostringstream kname;
      kname << "axpby_" << "general_" << type_to_name(out);

      // Make sure the metal library is available and look for it
      // in the same folder as this executable if needed
      d.register_library("mlx_ext", metal::get_colocated_mtllib_path);

      // Make a kernel from this metal library
      auto kernel = d.get_kernel(kname.str(), "mlx_ext");

      // Prepare to encode kernel
      auto compute_encoder = d.get_command_encoder(s.index);
      compute_encoder->setComputePipelineState(kernel);

      // Kernel parameters are registered with buffer indices corresponding to
      // those in the kernel declaration at axpby.metal
      int ndim = out.ndim();
      size_t nelem = out.size();

      // Encode input arrays to kernel
      set_array_buffer(compute_encoder, x, 0);
      set_array_buffer(compute_encoder, y, 1);

      // Encode output arrays to kernel
      set_array_buffer(compute_encoder, out, 2);

      // Encode alpha and beta
      compute_encoder->setBytes(&alpha_, sizeof(float), 3);
      compute_encoder->setBytes(&beta_, sizeof(float), 4);

      // Encode shape, strides and ndim
      compute_encoder->setBytes(x.shape().data(), ndim * sizeof(int), 5);
      compute_encoder->setBytes(x.strides().data(), ndim * sizeof(size_t), 6);
      compute_encoder->setBytes(y.strides().data(), ndim * sizeof(size_t), 7);
      compute_encoder->setBytes(&ndim, sizeof(int), 8);

      // We launch 1 thread for each input and make sure that the number of
      // threads in any given threadgroup is not higher than the max allowed
      size_t tgp_size = std::min(nelem, kernel->maxTotalThreadsPerThreadgroup());

      // Fix the 3D size of each threadgroup (in terms of threads)
      MTL::Size group_dims = MTL::Size(tgp_size, 1, 1);

      // Fix the 3D size of the launch grid (in terms of threads)
      MTL::Size grid_dims = MTL::Size(nelem, 1, 1);

      // Launch the grid with the given number of threads divided among
      // the given threadgroups
      compute_encoder->dispatchThreads(grid_dims, group_dims);
    }

We can now call the :meth:`axpby` operation on both the CPU and the GPU!

A few things to note about MLX and metal before moving on. MLX keeps track
of the active ``compute_encoder``. We rely on :meth:`d.get_command_encoder`
to give us the active metal compute command encoder instead of building a
new one and calling :meth:`compute_encoder->end_encoding` at the end.
MLX keeps adding kernels (compute pipelines) to the active command encoder
until some specified limit is hit or the compute encoder needs to be flushed
for synchronization. MLX also handles enqueuing and committing the associated
command buffers as needed. We suggest taking a deeper dive into
:class:`metal::Device` if you would like to study this routine further.

Primitive Transforms
^^^^^^^^^^^^^^^^^^^^^

Now that we have come this far, let's also learn how to add implementations to
transformations in a :class:`Primitive`. These transformations can be built on
top of our operations, including the one we just defined. This gives us the
following :meth:`Axpby::jvp` and :meth:`Axpby::vjp` implementations.

.. code-block:: C++

    /** The Jacobian-vector product. */
    array Axpby::jvp(
        const std::vector<array>& primals,
        const std::vector<array>& tangents,
        const std::vector<int>& argnums) {
      // Forward mode diff that pushes along the tangents
      // The jvp transform on the primitive can be built with ops
      // that are scheduled on the same stream as the primitive

      // If argnums = {0}, we only push along x in which case the
      // jvp is just the tangent scaled by alpha
      // Similarly, if argnums = {1}, the jvp is just the tangent
      // scaled by beta
      if (argnums.size() == 1) {
        auto scale = argnums[0] == 0 ? alpha_ : beta_;
        auto scale_arr = array(scale, tangents[0].dtype());
        return multiply(scale_arr, tangents[0], stream());
      }
      // If argnums = {0, 1}, we take contributions from both
      // which gives us jvp = tangent_x * alpha + tangent_y * beta
      else {
        return axpby(tangents[0], tangents[1], alpha_, beta_, stream());
      }
    }

.. code-block:: C++

    /** The vector-Jacobian product. */
    std::vector<array> Axpby::vjp(
        const std::vector<array>& primals,
        const array& cotan,
        const std::vector<int>& argnums) {
      // Reverse mode diff
      std::vector<array> vjps;
      for (auto arg : argnums) {
        auto scale = arg == 0 ? alpha_ : beta_;
        auto scale_arr = array(scale, cotan.dtype());
        vjps.push_back(multiply(scale_arr, cotan, stream()));
      }
      return vjps;
    }

Finally, you need not have a transformation fully defined to start using your
own :class:`Primitive`.

.. code-block:: C++

    /** Vectorize primitive along given axis */
    std::pair<array, int> Axpby::vmap(
        const std::vector<array>& inputs,
        const std::vector<int>& axes) {
      throw std::runtime_error("Axpby has no vmap implementation.");
    }
Building and Binding
|
||||
--------------------
|
||||
|
||||
Let's look at the overall directory structure first.
|
||||
|
||||
| extensions
|
||||
| ├── axpby
|
||||
| │ ├── axpby.cpp
|
||||
| │ ├── axpby.h
|
||||
| │ └── axpby.metal
|
||||
| ├── mlx_sample_extensions
|
||||
| │ └── __init__.py
|
||||
| ├── bindings.cpp
|
||||
| ├── CMakeLists.txt
|
||||
| └── setup.py
|
||||
|
||||
* ``extensions/axpby/`` defines the C++ extension library
|
||||
* ``extensions/mlx_sample_extensions`` sets out the strucutre for the
|
||||
associated python package
|
||||
* ``extensions/bindings.cpp`` provides python bindings for our operation
|
||||
* ``extensions/CMakeLists.txt`` holds CMake rules to build the library and
|
||||
python bindings
|
||||
* ``extensions/setup.py`` holds the ``setuptools`` rules to build and install
|
||||
the python package
|
||||
|
||||
Binding to Python
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
We use PyBind11_ to build a Python API for the C++ library. Since bindings
|
||||
for all needed components such as `mlx.core.array`, `mlx.core.stream`, etc.
|
||||
are already provided, adding our :meth:`axpby` becomes very simple!
|
||||
|
||||
.. code-block:: C++
|
||||
|
||||
PYBIND11_MODULE(mlx_sample_extensions, m) {
|
||||
m.doc() = "Sample C++ and metal extensions for MLX";
|
||||
|
||||
m.def(
|
||||
"axpby",
|
||||
&axpby,
|
||||
"x"_a,
|
||||
"y"_a,
|
||||
py::pos_only(),
|
||||
"alpha"_a,
|
||||
"beta"_a,
|
||||
py::kw_only(),
|
||||
"stream"_a = py::none(),
|
||||
R"pbdoc(
|
||||
Scale and sum two vectors elementwise
|
||||
``z = alpha * x + beta * y``
|
||||
|
||||
Follows numpy style broadcasting between ``x`` and ``y``
|
||||
Inputs are upcasted to floats if needed
|
||||
|
||||
Args:
|
||||
x (array): Input array.
|
||||
y (array): Input array.
|
||||
alpha (float): Scaling factor for ``x``.
|
||||
beta (float): Scaling factor for ``y``.
|
||||
|
||||
Returns:
|
||||
array: ``alpha * x + beta * y``
|
||||
)pbdoc");
|
||||
}
|
||||
|
||||
Most of the complexity in the above example comes from additional bells and
|
||||
whistles such as the literal names and doc-strings.
|
||||
|
||||
.. warning::
|
||||
|
||||
:mod:`mlx.core` needs to be imported before importing
|
||||
:mod:`mlx_sample_extensions` as defined by the pybind11 module above to
|
||||
ensure that the casters for :mod:`mlx.core` components like
|
||||
:class:`mlx.core.array` are available.
|
||||
|
.. _Building with CMake:

Building with CMake
^^^^^^^^^^^^^^^^^^^

Building the C++ extension library itself is simple: it only requires that you
``find_package(MLX CONFIG)`` and then link it to your library.

.. code-block:: cmake

  # Add library
  add_library(mlx_ext)

  # Add sources
  target_sources(
    mlx_ext
    PUBLIC
    ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.cpp
  )

  # Add include headers
  target_include_directories(
    mlx_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR}
  )

  # Link to mlx
  target_link_libraries(mlx_ext PUBLIC mlx)

We also need to build the attached metal library. For convenience, we provide a
:meth:`mlx_build_metallib` function that builds a ``.metallib`` target given
sources, headers, destinations, etc. (defined in ``cmake/extension.cmake`` and
automatically imported with the MLX package).

Here is what that looks like in practice!

.. code-block:: cmake

  # Build metallib
  if(MLX_BUILD_METAL)

    mlx_build_metallib(
      TARGET mlx_ext_metallib
      TITLE mlx_ext
      SOURCES ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.metal
      INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${MLX_INCLUDE_DIRS}
      OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
    )

    add_dependencies(
      mlx_ext
      mlx_ext_metallib
    )

  endif()

Finally, we build the PyBind11_ bindings.

.. code-block:: cmake

  pybind11_add_module(
    mlx_sample_extensions
    ${CMAKE_CURRENT_LIST_DIR}/bindings.cpp
  )
  target_link_libraries(mlx_sample_extensions PRIVATE mlx_ext)

  if(BUILD_SHARED_LIBS)
    target_link_options(mlx_sample_extensions PRIVATE -Wl,-rpath,@loader_path)
  endif()

Building with ``setuptools``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Once we have set out the CMake build rules as described above, we can use the
build utilities defined in :mod:`mlx.extension` for a simple build process.

.. code-block:: python

  from mlx import extension
  from setuptools import setup

  if __name__ == "__main__":
      setup(
          name="mlx_sample_extensions",
          version="0.0.0",
          description="Sample C++ and Metal extensions for MLX primitives.",
          ext_modules=[extension.CMakeExtension("mlx_sample_extensions")],
          cmdclass={"build_ext": extension.CMakeBuild},
          packages=["mlx_sample_extensions"],
          package_dir={"": "mlx_sample_extensions"},
          package_data={"mlx_sample_extensions": ["*.so", "*.dylib", "*.metallib"]},
          zip_safe=False,
          python_requires=">=3.7",
      )

.. note::
   We treat ``extensions/mlx_sample_extensions`` as the package directory
   even though it only contains a ``__init__.py`` to ensure the following:

   * :mod:`mlx.core` is always imported before importing :mod:`mlx_sample_extensions`
   * The C++ extension library and the metal library are co-located with the python
     bindings and copied together if the package is installed

You can build inplace for development using
``python setup.py build_ext -j8 --inplace`` (in ``extensions/``)

This will result in a directory structure as follows:

| extensions
| ├── mlx_sample_extensions
| │   ├── __init__.py
| │   ├── libmlx_ext.dylib # C++ extension library
| │   ├── mlx_ext.metallib # Metal library
| │   └── mlx_sample_extensions.cpython-3x-darwin.so # Python Binding
| ...

When you install using the command ``python -m pip install .``
(in ``extensions/``), the package will be installed with the same structure as
``extensions/mlx_sample_extensions``, and the C++ and metal library will be
copied along with the python binding since they are specified as ``package_data``.

Usage
-----

After installing the extension as described above, you should be able to simply
import the python package and play with it as you would any other MLX operation!

Let's look at a simple script and its results.

.. code-block:: python

  import mlx.core as mx
  from mlx_sample_extensions import axpby

  a = mx.ones((3, 4))
  b = mx.ones((3, 4))
  c = axpby(a, b, 4.0, 2.0, stream=mx.cpu)

  print(f"c shape: {c.shape}")
  print(f"c dtype: {c.dtype}")
  print(f"c correctness: {mx.all(c == 6.0).item()}")

Output:

.. code-block::

  c shape: [3, 4]
  c dtype: float32
  c correctness: True

Results
^^^^^^^

Let's run a quick benchmark and see how our new ``axpby`` operation compares
with the naive :meth:`simple_axpby` we defined at first on the CPU.

.. code-block:: python

  import mlx.core as mx
  from mlx_sample_extensions import axpby
  import time

  mx.set_default_device(mx.cpu)

  def simple_axpby(x: mx.array, y: mx.array, alpha: float, beta: float) -> mx.array:
      return alpha * x + beta * y

  M = 256
  N = 512

  x = mx.random.normal((M, N))
  y = mx.random.normal((M, N))
  alpha = 4.0
  beta = 2.0

  mx.eval((x, y))

  def bench(f):
      # Warm up
      for i in range(100):
          z = f(x, y, alpha, beta)
          mx.eval(z)

      # Timed run
      s = time.time()
      for i in range(5000):
          z = f(x, y, alpha, beta)
          mx.eval(z)
      e = time.time()
      return e - s

  simple_time = bench(simple_axpby)
  custom_time = bench(axpby)

  print(f"Simple axpby: {simple_time:.3f} s | Custom axpby: {custom_time:.3f} s")

Results:

.. code-block::

  Simple axpby: 0.114 s | Custom axpby: 0.109 s

We see some modest improvements right away!

This operation can now be used to build other operations, in
:class:`mlx.nn.Module` calls, and as part of graph transformations such as
:meth:`grad` and :meth:`simplify`!

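For example, since :meth:`grad` works through the primitive's VJP defined
earlier, you can differentiate through ``axpby`` directly. A minimal sketch
(the scalar ``loss`` wrapper here is our own illustration, not part of the
example scripts):

.. code-block:: python

  import mlx.core as mx
  from mlx_sample_extensions import axpby

  x = mx.ones((3, 4))
  y = mx.ones((3, 4))

  # Reduce to a scalar so we can take a gradient
  def loss(x):
      return mx.sum(axpby(x, y, 4.0, 2.0, stream=mx.cpu))

  dx = mx.grad(loss)(x)  # every entry should be alpha = 4.0
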
Scripts
-------

.. admonition:: Download the code

   The full example code is available in `mlx-examples <code>`_.

.. code: `TODO_LINK/extensions`_

.. _Accelerate: https://developer.apple.com/documentation/accelerate/blas?language=objc
.. _Metal: https://developer.apple.com/documentation/metal?language=objc
.. _Metal-cpp: https://developer.apple.com/metal/cpp/
.. _`Metal Specification`: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
.. _`Metal Example`: https://developer.apple.com/documentation/metal/performing_calculations_on_a_gpu?language=objc
.. _PyBind11: https://pybind11.readthedocs.io/en/stable/

77
docs/src/examples/linear_regression.rst
Normal file
77
docs/src/examples/linear_regression.rst
Normal file
@ -0,0 +1,77 @@
.. _linear_regression:

Linear Regression
-----------------

Let's implement a basic linear regression model as a starting point to
learn MLX. First import the core package and set up some problem metadata:

.. code-block:: python

  import mlx.core as mx

  num_features = 100
  num_examples = 1_000
  num_iters = 10_000  # iterations of SGD
  lr = 0.01  # learning rate for SGD

We'll generate a synthetic dataset by:

1. Sampling the design matrix ``X``.
2. Sampling a ground truth parameter vector ``w_star``.
3. Computing the dependent values ``y`` by adding Gaussian noise to ``X @ w_star``.

.. code-block:: python

  # True parameters
  w_star = mx.random.normal((num_features,))

  # Input examples (design matrix)
  X = mx.random.normal((num_examples, num_features))

  # Noisy labels
  eps = 1e-2 * mx.random.normal((num_examples,))
  y = X @ w_star + eps

We will use SGD to find the optimal weights. To start, define the squared loss
and get the gradient function of the loss with respect to the parameters.

.. code-block:: python

  def loss_fn(w):
      return 0.5 * mx.mean(mx.square(X @ w - y))

  grad_fn = mx.grad(loss_fn)

Start the optimization by initializing the parameters ``w`` randomly. Then
repeatedly update the parameters for ``num_iters`` iterations.

.. code-block:: python

  w = 1e-2 * mx.random.normal((num_features,))

  for _ in range(num_iters):
      grad = grad_fn(w)
      w = w - lr * grad
      mx.eval(w)

Finally, compute the loss of the learned parameters and verify that they are
close to the ground truth parameters.

.. code-block:: python

  loss = loss_fn(w)
  error_norm = mx.sum(mx.square(w - w_star)).item() ** 0.5

  print(
      f"Loss {loss.item():.5f}, |w-w*| = {error_norm:.5f}, "
  )
  # Should print something close to: Loss 0.00005, |w-w*| = 0.00364

Complete `linear regression
<https://github.com/ml-explore/mlx/tree/main/examples/python/linear_regression.py>`_
and `logistic regression
<https://github.com/ml-explore/mlx/tree/main/examples/python/logistic_regression.py>`_
examples are available in the MLX GitHub repo.

52
docs/src/python/data_types.rst
Normal file
52
docs/src/python/data_types.rst
Normal file
@ -0,0 +1,52 @@
.. _data_types:

:orphan:

Data Types
==========

.. currentmodule:: mlx.core

The default floating point type is ``float32`` and the default integer type is
``int32``. The table below shows supported values for :obj:`Dtype`.

.. list-table:: Supported Data Types
   :widths: 5 3 20
   :header-rows: 1

   * - Type
     - Bytes
     - Description
   * - ``bool_``
     - 1
     - Boolean (``True``, ``False``) data type
   * - ``uint8``
     - 1
     - 8-bit unsigned integer
   * - ``uint16``
     - 2
     - 16-bit unsigned integer
   * - ``uint32``
     - 4
     - 32-bit unsigned integer
   * - ``uint64``
     - 8
     - 64-bit unsigned integer
   * - ``int8``
     - 1
     - 8-bit signed integer
   * - ``int16``
     - 2
     - 16-bit signed integer
   * - ``int32``
     - 4
     - 32-bit signed integer
   * - ``int64``
     - 8
     - 64-bit signed integer
   * - ``float16``
     - 2
     - 16-bit float, only available with `ARM C language extensions <https://developer.arm.com/documentation/101028/0012/3--C-language-extensions?lang=en>`_
   * - ``float32``
     - 4
     - 32-bit float

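As a quick illustration of the defaults above (a minimal sketch; the expected
dtypes follow directly from the table):

.. code-block:: python

  import mlx.core as mx

  # Python floats and ints map to the default dtypes
  print(mx.array(1.0).dtype)        # float32
  print(mx.array([1, 2, 3]).dtype)  # int32

  # A dtype can also be requested explicitly
  x = mx.array([1.0, 2.0], dtype=mx.float16)
  print(x.dtype)                    # float16
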
17
docs/src/python/devices_and_streams.rst
Normal file
17
docs/src/python/devices_and_streams.rst
Normal file
@ -0,0 +1,17 @@
.. _devices_and_streams:

Devices and Streams
===================

.. currentmodule:: mlx.core

.. autosummary::
   :toctree: _autosummary

   Device
   default_device
   set_default_device
   Stream
   default_stream
   new_stream
   set_default_stream

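A short sketch of how these fit together (the printed device is
machine-dependent; the CPU device used here is always available):

.. code-block:: python

  import mlx.core as mx

  print(mx.default_device())  # e.g. the GPU on Apple silicon

  # Route all subsequent ops to the CPU by default
  mx.set_default_device(mx.cpu)

  # Streams order work on a device; ops accept an explicit stream
  s = mx.new_stream(mx.cpu)
  a = mx.add(mx.ones((2, 2)), 1.0, stream=s)
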
16
docs/src/python/transforms.rst
Normal file
16
docs/src/python/transforms.rst
Normal file
@ -0,0 +1,16 @@
.. _transforms:

Transforms
==========

.. currentmodule:: mlx.core

.. autosummary::
   :toctree: _autosummary

   eval
   grad
   value_and_grad
   jvp
   vjp
   vmap

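For instance, a minimal sketch using two of the transformations listed above:

.. code-block:: python

  import mlx.core as mx

  def f(x):
      return mx.sum(mx.square(x))

  x = mx.array([0.0, 1.0, 2.0])

  value, dfdx = mx.value_and_grad(f)(x)  # value = 5.0, dfdx = 2 * x
  mx.eval(dfdx)                          # MLX is lazy; eval forces computation
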
21
docs/src/python/tree_utils.rst
Normal file
21
docs/src/python/tree_utils.rst
Normal file
@ -0,0 +1,21 @@
.. _utils:

Tree Utils
==========

In MLX we consider a python tree to be an arbitrarily nested collection of
dictionaries, lists and tuples without cycles. Functions in this module that
return python trees use the default python ``dict``, ``list`` and ``tuple``,
but they can usually process objects that inherit from any of these.

.. note::
   Dictionaries should have keys that are valid python identifiers.

.. currentmodule:: mlx.utils

.. autosummary::
   :toctree: _autosummary

   tree_flatten
   tree_unflatten
   tree_map

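A minimal sketch of these utilities on a nested parameter dictionary (the
``params`` value is our own illustration):

.. code-block:: python

  from mlx.utils import tree_flatten, tree_map, tree_unflatten

  params = {"layer1": {"w": 1.0, "b": 2.0}, "scales": [3.0, 4.0]}

  # Apply a function to every leaf, preserving the tree structure
  doubled = tree_map(lambda x: 2 * x, params)

  # Flatten to a list of (dotted key, value) pairs and invert
  flat = tree_flatten(params)  # [("layer1.w", 1.0), ...]
  rebuilt = tree_unflatten(flat)
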
10
examples/cpp/CMakeLists.txt
Normal file
10
examples/cpp/CMakeLists.txt
Normal file
@ -0,0 +1,10 @@
function(build_example SRCFILE)
  get_filename_component(src_name ${SRCFILE} NAME_WE)
  set(target "${src_name}")
  add_executable(${target} ${SRCFILE})
  target_link_libraries(${target} PRIVATE mlx)
endfunction(build_example)

build_example(tutorial.cpp)
build_example(linear_regression.cpp)
build_example(logistic_regression.cpp)

52
examples/cpp/linear_regression.cpp
Normal file
52
examples/cpp/linear_regression.cpp
Normal file
@ -0,0 +1,52 @@
#include <chrono>
#include <cmath>
#include <iostream>

#include "mlx/mlx.h"
#include "timer.h"

/**
 * An example of linear regression with MLX.
 */
using namespace mlx::core;

int main() {
  int num_features = 100;
  int num_examples = 1'000;
  int num_iters = 10'000;
  float learning_rate = 0.01;

  // True parameters
  auto w_star = random::normal({num_features});

  // The input examples (design matrix)
  auto X = random::normal({num_examples, num_features});

  // Noisy labels
  auto eps = 1e-2 * random::normal({num_examples});
  auto y = matmul(X, w_star) + eps;

  // Initialize random parameters
  array w = 1e-2 * random::normal({num_features});

  auto loss_fn = [&](array w) {
    auto yhat = matmul(X, w);
    return (0.5f / num_examples) * sum(square(yhat - y));
  };

  auto grad_fn = grad(loss_fn);

  auto tic = timer::time();
  for (int it = 0; it < num_iters; ++it) {
    auto grad = grad_fn(w);
    w = w - learning_rate * grad;
    eval(w);
  }
  auto toc = timer::time();

  auto loss = loss_fn(w);
  auto error_norm = std::sqrt(sum(square(w - w_star)).item<float>());
  auto throughput = num_iters / timer::seconds(toc - tic);
  std::cout << "Loss " << loss << ", |w - w*| = " << error_norm
            << ", Throughput " << throughput << " (it/s)." << std::endl;
}

3581
mlx/3rdparty/pocketfft.h
vendored
Normal file
3581
mlx/3rdparty/pocketfft.h
vendored
Normal file
File diff suppressed because it is too large
64
mlx/allocator.h
Normal file
64
mlx/allocator.h
Normal file
@ -0,0 +1,64 @@
#pragma once

#include <cstdlib>

namespace mlx::core::allocator {

// Simple wrapper around buffer pointers
// WARNING: Only Buffer objects constructed from and those that wrap
// raw pointers from mlx::allocator are supported.
class Buffer {
 private:
  void* ptr_;

 public:
  Buffer(void* ptr) : ptr_(ptr) {}

  // Get the raw data pointer from the buffer
  void* raw_ptr();

  // Get the buffer pointer from the buffer
  const void* ptr() const {
    return ptr_;
  }
  void* ptr() {
    return ptr_;
  }
};

Buffer malloc(size_t size);

void free(Buffer buffer);

// Wait for running tasks to finish and free up memory
// if allocation fails
Buffer malloc_or_wait(size_t size);

class Allocator {
  /** Abstract base class for a memory allocator. */
 public:
  virtual Buffer malloc(size_t size) = 0;
  virtual void free(Buffer buffer) = 0;

  Allocator() = default;
  Allocator(const Allocator& other) = delete;
  Allocator(Allocator&& other) = delete;
  Allocator& operator=(const Allocator& other) = delete;
  Allocator& operator=(Allocator&& other) = delete;
  virtual ~Allocator() = default;
};

Allocator& allocator();

class CommonAllocator : public Allocator {
  /** A general CPU allocator. */
 public:
  virtual Buffer malloc(size_t size) override;
  virtual void free(Buffer buffer) override;

 private:
  CommonAllocator() = default;
  friend Allocator& allocator();
};

} // namespace mlx::core::allocator

18
mlx/backend/accelerate/conv.cpp
Normal file
18
mlx/backend/accelerate/conv.cpp
Normal file
@ -0,0 +1,18 @@
#include <cassert>

#include <simd/vector.h>
#include <vecLib/vDSP.h>

#include "mlx/backend/common/copy.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);

  // TODO: Add accelerate based optimizations for CPU conv
}

} // namespace mlx::core

26
mlx/backend/accelerate/utils.h
Normal file
26
mlx/backend/accelerate/utils.h
Normal file
@ -0,0 +1,26 @@
#pragma once

#include <stdexcept>

#include <vecLib/BNNS/bnns.h>
#include "mlx/dtype.h"

namespace mlx::core {

BNNSDataType to_bnns_dtype(Dtype mlx_dtype) {
  uint32_t size_bits = size_of(mlx_dtype) * 8;
  switch (kindof(mlx_dtype)) {
    case Dtype::Kind::b:
      return BNNSDataTypeBoolean;
    case Dtype::Kind::u:
      return BNNSDataType(BNNSDataTypeUIntBit | size_bits);
    case Dtype::Kind::i:
      return BNNSDataType(BNNSDataTypeIntBit | size_bits);
    case Dtype::Kind::f:
      return BNNSDataType(BNNSDataTypeFloatBit | size_bits);
    case Dtype::Kind::V:
      return BNNSDataTypeBFloat16;
    case Dtype::Kind::c:
      throw std::invalid_argument("BNNS does not support complex types");
  }
}

} // namespace mlx::core

27
mlx/backend/common/copy.h
Normal file
27
mlx/backend/common/copy.h
Normal file
@ -0,0 +1,27 @@
#pragma once

#include "mlx/array.h"
#include "mlx/backend/common/utils.h"

namespace mlx::core {

enum class CopyType {
  // Copy a raw scalar input into the full contiguous output
  Scalar,

  // Copy the raw input buffer contiguously into a raw output buffer of the
  // same size
  Vector,

  // Copy the full virtual input to the full contiguous output
  General,

  // Copy the full virtual input to the full virtual output. We assume the
  // input and output have the same shape.
  GeneralGeneral
};

void copy(const array& src, array& dst, CopyType ctype);
void copy_inplace(const array& src, array& dst, CopyType ctype);

} // namespace mlx::core

394
mlx/backend/common/sort.cpp
Normal file
394
mlx/backend/common/sort.cpp
Normal file
@ -0,0 +1,394 @@
#include <algorithm>
#include <cassert>
#include <cmath>
#include <numeric>

#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"

#include "mlx/primitives.h"

namespace mlx::core {

namespace {

template <typename T, typename IdxT = int32_t>
struct StridedIterator {
  using iterator_category = std::random_access_iterator_tag;
  using difference_type = IdxT;
  using value_type = T;
  using reference = value_type&;
  using pointer = value_type*;

  // Constructors
  StridedIterator() = default;

  explicit StridedIterator(T* ptr, size_t stride, difference_type offset = 0)
      : stride_(stride), ptr_(ptr + offset * stride) {}

  explicit StridedIterator(array& arr, int axis, difference_type offset = 0)
      : StridedIterator(arr.data<T>(), arr.strides()[axis], offset) {}

  // Accessors
  reference operator*() const {
    return ptr_[0];
  }

  reference operator[](difference_type idx) const {
    return ptr_[idx * stride_];
  }

  // Comparisons
  bool operator==(const StridedIterator& other) const {
    return ptr_ == other.ptr_ && stride_ == other.stride_;
  }

  bool operator!=(const StridedIterator& other) const {
    return ptr_ != other.ptr_;
  }

  bool operator<(const StridedIterator& other) const {
    return ptr_ < other.ptr_;
  }

  bool operator>(const StridedIterator& other) const {
    return ptr_ > other.ptr_;
  }

  bool operator<=(const StridedIterator& other) const {
    return ptr_ <= other.ptr_;
  }

  bool operator>=(const StridedIterator& other) const {
    return ptr_ >= other.ptr_;
  }

  difference_type operator-(const StridedIterator& other) const {
    return (ptr_ - other.ptr_) / stride_;
  }

  // Moving
  StridedIterator& operator++() {
    ptr_ += stride_;
    return *this;
  }

  StridedIterator& operator--() {
    ptr_ -= stride_;
    return *this;
  }

  StridedIterator& operator+=(difference_type diff) {
    ptr_ += diff * stride_;
    return *this;
  }

  StridedIterator& operator-=(difference_type diff) {
    ptr_ -= diff * stride_;
    return *this;
  }

  StridedIterator operator+(difference_type diff) {
    return StridedIterator(ptr_, stride_, diff);
  }

  StridedIterator operator-(difference_type diff) {
    return StridedIterator(ptr_, stride_, -diff);
  }

 private:
  size_t stride_;
  T* ptr_;
};

template <typename T, typename IdxT = uint32_t>
void sort(const array& in, array& out, int axis) {
  // Copy input to output
  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
  copy(in, out, ctype);

  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
  size_t n_rows = in.size() / in.shape(axis);

  auto remaining_shape = in.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);

  auto remaining_strides = in.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

  size_t axis_stride = in.strides()[axis];
  int axis_size = in.shape(axis);

  // Perform sorting in place
  for (int i = 0; i < n_rows; i++) {
    size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
    T* data_ptr = out.data<T>() + loc;

    StridedIterator st(data_ptr, axis_stride, 0);
    StridedIterator ed(data_ptr, axis_stride, axis_size);

    std::stable_sort(st, ed);
  }
}

template <typename T, typename IdxT = uint32_t>
void argsort(const array& in, array& out, int axis) {
  // Allocate output
  out.set_data(allocator::malloc_or_wait(out.nbytes()));

  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
  size_t n_rows = in.size() / in.shape(axis);

  auto remaining_shape = in.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);

  auto remaining_strides = in.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

  size_t axis_stride = in.strides()[axis];
  int axis_size = in.shape(axis);

  // Perform sorting
  for (int i = 0; i < n_rows; i++) {
    size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
    const T* data_ptr = in.data<T>() + loc;
    IdxT* idx_ptr = out.data<IdxT>() + loc;

    StridedIterator st_(idx_ptr, axis_stride, 0);
    StridedIterator ed_(idx_ptr, axis_stride, axis_size);

    // Initialize with iota
    std::iota(st_, ed_, IdxT(0));

    // Sort according to vals
    StridedIterator st(idx_ptr, axis_stride, 0);
    StridedIterator ed(idx_ptr, axis_stride, axis_size);

    std::stable_sort(st, ed, [data_ptr, axis_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * axis_stride];
      auto v2 = data_ptr[b * axis_stride];
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
}

template <typename T, typename IdxT = uint32_t>
void partition(const array& in, array& out, int axis, int kth) {
  // Copy input to output
  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
  copy(in, out, ctype);

  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
  size_t n_rows = in.size() / in.shape(axis);

  auto remaining_shape = in.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);

  auto remaining_strides = in.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

  size_t axis_stride = in.strides()[axis];
  int axis_size = in.shape(axis);

  kth = kth < 0 ? kth + axis_size : kth;

  // Perform partition in place
  for (int i = 0; i < n_rows; i++) {
    size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
    T* data_ptr = out.data<T>() + loc;

    StridedIterator st(data_ptr, axis_stride, 0);
    StridedIterator md(data_ptr, axis_stride, kth);
    StridedIterator ed(data_ptr, axis_stride, axis_size);

    std::nth_element(st, md, ed);
  }
}

template <typename T, typename IdxT = uint32_t>
void argpartition(const array& in, array& out, int axis, int kth) {
  // Allocate output
  out.set_data(allocator::malloc_or_wait(out.nbytes()));

  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;
  size_t n_rows = in.size() / in.shape(axis);

  auto remaining_shape = in.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);

  auto remaining_strides = in.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

  size_t axis_stride = in.strides()[axis];
  int axis_size = in.shape(axis);

  kth = kth < 0 ? kth + axis_size : kth;

  // Perform partition
  for (int i = 0; i < n_rows; i++) {
    size_t loc = elem_to_loc(i, remaining_shape, remaining_strides);
    const T* data_ptr = in.data<T>() + loc;
    IdxT* idx_ptr = out.data<IdxT>() + loc;

    StridedIterator st_(idx_ptr, axis_stride, 0);
    StridedIterator ed_(idx_ptr, axis_stride, axis_size);

    // Initialize with iota
    std::iota(st_, ed_, IdxT(0));

    // Sort according to vals
    StridedIterator st(idx_ptr, axis_stride, 0);
    StridedIterator md(idx_ptr, axis_stride, kth);
    StridedIterator ed(idx_ptr, axis_stride, axis_size);

    std::nth_element(st, md, ed, [data_ptr, axis_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * axis_stride];
      auto v2 = data_ptr[b * axis_stride];
      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
}

} // namespace

void ArgSort::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  switch (in.dtype()) {
    case bool_:
      return argsort<bool>(in, out, axis_);
    case uint8:
      return argsort<uint8_t>(in, out, axis_);
    case uint16:
      return argsort<uint16_t>(in, out, axis_);
    case uint32:
      return argsort<uint32_t>(in, out, axis_);
    case uint64:
      return argsort<uint64_t>(in, out, axis_);
    case int8:
      return argsort<int8_t>(in, out, axis_);
    case int16:
      return argsort<int16_t>(in, out, axis_);
    case int32:
      return argsort<int32_t>(in, out, axis_);
    case int64:
      return argsort<int64_t>(in, out, axis_);
    case float32:
      return argsort<float>(in, out, axis_);
    case float16:
      return argsort<float16_t>(in, out, axis_);
    case bfloat16:
      return argsort<bfloat16_t>(in, out, axis_);
    case complex64:
      return argsort<complex64_t>(in, out, axis_);
  }
}

void Sort::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  switch (in.dtype()) {
    case bool_:
      return sort<bool>(in, out, axis_);
    case uint8:
      return sort<uint8_t>(in, out, axis_);
    case uint16:
      return sort<uint16_t>(in, out, axis_);
    case uint32:
      return sort<uint32_t>(in, out, axis_);
    case uint64:
      return sort<uint64_t>(in, out, axis_);
    case int8:
      return sort<int8_t>(in, out, axis_);
    case int16:
      return sort<int16_t>(in, out, axis_);
    case int32:
      return sort<int32_t>(in, out, axis_);
    case int64:
      return sort<int64_t>(in, out, axis_);
    case float32:
      return sort<float>(in, out, axis_);
    case float16:
      return sort<float16_t>(in, out, axis_);
    case bfloat16:
      return sort<bfloat16_t>(in, out, axis_);
    case complex64:
      return sort<complex64_t>(in, out, axis_);
  }
}

void ArgPartition::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  switch (in.dtype()) {
    case bool_:
      return argpartition<bool>(in, out, axis_, kth_);
    case uint8:
      return argpartition<uint8_t>(in, out, axis_, kth_);
    case uint16:
      return argpartition<uint16_t>(in, out, axis_, kth_);
    case uint32:
      return argpartition<uint32_t>(in, out, axis_, kth_);
    case uint64:
      return argpartition<uint64_t>(in, out, axis_, kth_);
    case int8:
      return argpartition<int8_t>(in, out, axis_, kth_);
    case int16:
      return argpartition<int16_t>(in, out, axis_, kth_);
    case int32:
      return argpartition<int32_t>(in, out, axis_, kth_);
    case int64:
      return argpartition<int64_t>(in, out, axis_, kth_);
    case float32:
      return argpartition<float>(in, out, axis_, kth_);
    case float16:
      return argpartition<float16_t>(in, out, axis_, kth_);
    case bfloat16:
      return argpartition<bfloat16_t>(in, out, axis_, kth_);
    case complex64:
      return argpartition<complex64_t>(in, out, axis_, kth_);
  }
}

void Partition::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  switch (in.dtype()) {
    case bool_:
      return partition<bool>(in, out, axis_, kth_);
    case uint8:
      return partition<uint8_t>(in, out, axis_, kth_);
    case uint16:
      return partition<uint16_t>(in, out, axis_, kth_);
    case uint32:
      return partition<uint32_t>(in, out, axis_, kth_);
    case uint64:
      return partition<uint64_t>(in, out, axis_, kth_);
    case int8:
      return partition<int8_t>(in, out, axis_, kth_);
    case int16:
      return partition<int16_t>(in, out, axis_, kth_);
    case int32:
      return partition<int32_t>(in, out, axis_, kth_);
    case int64:
      return partition<int64_t>(in, out, axis_, kth_);
    case float32:
      return partition<float>(in, out, axis_, kth_);
    case float16:
      return partition<float16_t>(in, out, axis_, kth_);
    case bfloat16:
      return partition<bfloat16_t>(in, out, axis_, kth_);
    case complex64:
      return partition<complex64_t>(in, out, axis_, kth_);
  }
}

} // namespace mlx::core

257
mlx/backend/metal/device.cpp
Normal file
257
mlx/backend/metal/device.cpp
Normal file
@ -0,0 +1,257 @@
#include <dlfcn.h>
#include <cstdlib>
#include <filesystem>
#include <sstream>

#define NS_PRIVATE_IMPLEMENTATION
#define CA_PRIVATE_IMPLEMENTATION
#define MTL_PRIVATE_IMPLEMENTATION

#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/metal.h"
#include "mlx/backend/metal/mps/gemm.h"

namespace fs = std::filesystem;

namespace mlx::core::metal {

static Device metal_device_;

namespace {

// TODO nicer way to set this or possibly expose as an environment variable
static constexpr int MAX_BUFFERS_PER_QUEUE = 12;

static constexpr const char* default_mtllib_path = METAL_PATH;

auto load_device() {
  MTL::Device* device = MTL::CreateSystemDefaultDevice();
  if (!device) {
    throw std::runtime_error("Failed to load device");
  }
  return device;
}

std::pair<MTL::Library*, NS::Error*> load_library_from_path(
    MTL::Device* device,
    const char* path) {
  auto library = NS::String::string(path, NS::UTF8StringEncoding);
  NS::Error* error;
  auto lib = device->newLibrary(library, &error);

  return std::make_pair(lib, error);
}

MTL::Library* load_library(
    MTL::Device* device,
    const std::string& lib_name = "mlx",
    const char* lib_path = default_mtllib_path) {
  // Firstly, search for the metallib in the same path as this binary
  std::string first_path = get_colocated_mtllib_path(lib_name);
  if (first_path.size() != 0) {
    auto [lib, error] = load_library_from_path(device, first_path.c_str());
    if (lib) {
      return lib;
    }
  }

  // Couldn't find it so let's load it from default_mtllib_path
  {
    auto [lib, error] = load_library_from_path(device, lib_path);
    if (!lib) {
      std::ostringstream msg;
      msg << error->localizedDescription()->utf8String() << "\n"
          << "Failed to load device library from <" << lib_path << ">"
          << " or <" << first_path << ">.";
      throw std::runtime_error(msg.str());
    }
    return lib;
  }
}

} // namespace

Device::Device()
    : pool_(NS::AutoreleasePool::alloc()->init()),
      device_(load_device()),
      library_map_({{"mlx", load_library(device_)}}) {}

Device::~Device() {
  for (auto& q : queue_map_) {
    q.second->release();
  }
  for (auto& k : kernel_map_) {
    k.second->release();
  }
  for (auto& l : library_map_) {
    l.second->release();
  }
  device_->release();
  pool_->release();
}

void Device::new_queue(int index) {
  // Multiple threads can ask the device for queues
  // We lock this as a critical section for safety
  const std::lock_guard<std::mutex> lock(mtx_);
  auto q = device_->newCommandQueue(MAX_BUFFERS_PER_QUEUE);
  if (!q) {
    throw std::runtime_error(
        "[metal::Device] Failed to make new command queue.");
  }
  queue_map_.insert({index, q});
}

int Device::get_command_buffer_ops(int index) {
  auto bit = buffer_map_.find(index);
  return bit->second.first;
}

void Device::increment_command_buffer_ops(int index) {
  auto bit = buffer_map_.find(index);
  bit->second.first++;
}

MTL::CommandBuffer* Device::get_command_buffer(int index) {
  auto bit = buffer_map_.find(index);
  return (bit == buffer_map_.end()) ? nullptr : bit->second.second;
}

MTL::CommandBuffer* Device::new_command_buffer(int index) {
  auto qit = queue_map_.find(index);
  if (qit == queue_map_.end()) {
    throw std::runtime_error(
        "[metal::Device] Attempting to get command buffer for invalid queue.");
  }

  auto cb = qit->second->commandBufferWithUnretainedReferences();

  if (!cb) {
    throw std::runtime_error(
        "[metal::Device] Unable to create new command buffer");
  }

  // Increment ref count so the buffer is not garbage collected
  cb->retain();

  return buffer_map_.insert({index, {0, cb}}).first->second.second;
}

void Device::commit_command_buffer(int index) {
  auto bit = buffer_map_.find(index);
  bit->second.second->commit();
  bit->second.second->release();
  buffer_map_.erase(bit);
}

void Device::end_encoding(int index) {
  auto eit = encoder_map_.find(index);
  if (eit != encoder_map_.end()) {
    eit->second->endEncoding();
    eit->second->release();
    encoder_map_.erase(eit);
  }
}

MTL::ComputeCommandEncoder* Device::get_command_encoder(int index) {
  auto eit = encoder_map_.find(index);
  if (eit == encoder_map_.end()) {
    auto cb = get_command_buffer(index);
    auto compute_encoder = cb->computeCommandEncoder();
    // Increment ref count so the buffer is not garbage collected
    compute_encoder->retain();
    eit = encoder_map_.insert({index, compute_encoder}).first;
  }
  return eit->second;
}

MTL::ArgumentEncoder* Device::argument_encoder(
    const std::vector<MTL::ArgumentDescriptor*>& arg_descs) const {
  // NB array here is already autoreleased but the returned argument
  // encoder is owned by the caller and must be released/autoreleased
  NS::Array* arg_desc_arr = NS::Array::array(
      reinterpret_cast<NS::Object* const*>(arg_descs.data()), arg_descs.size());
  return device_->newArgumentEncoder(arg_desc_arr);
}

void Device::register_library(
    const std::string& lib_name,
    const std::string& lib_path) {
  if (auto it = library_map_.find(lib_name); it == library_map_.end()) {
    auto new_lib = load_library(device_, lib_name, lib_path.c_str());
    library_map_.insert({lib_name, new_lib});
  }
}

void Device::register_library(
    const std::string& lib_name,
    const std::function<std::string(const std::string&)>& lib_path_func) {
  if (auto it = library_map_.find(lib_name); it == library_map_.end()) {
    std::string new_lib_path = lib_path_func(lib_name);
    auto new_lib = load_library(device_, lib_name, new_lib_path.c_str());
    library_map_.insert({lib_name, new_lib});
  }
}

MTL::ComputePipelineState* Device::get_kernel(
    const std::string& name,
    const std::string& lib_name /* = "mlx" */) {
  // Look for cached kernel
  if (auto it = kernel_map_.find(name); it != kernel_map_.end()) {
    return it->second;
  }

  // Prepare new kernel

  // Search for cached metal lib
  MTL::Library* mtl_lib;
  if (auto it = library_map_.find(name); it != library_map_.end()) {
    mtl_lib = it->second;
  } else { // Look for metallib alongside library
    register_library(lib_name);
    mtl_lib = library_map_[lib_name];
  }

  // Pull kernel from library
  auto ns_name = NS::String::string(name.c_str(), NS::ASCIIStringEncoding);
  auto mtl_function = mtl_lib->newFunction(ns_name);

  // Compile kernel to compute pipeline
  NS::Error* error = nullptr;
  MTL::ComputePipelineState* kernel = nullptr;
  if (mtl_function) {
    kernel = device_->newComputePipelineState(mtl_function, &error);
    mtl_function->release();
  }
  if (!mtl_function || !kernel) {
    std::ostringstream msg;
    msg << "[metal::Device] Unable to load kernel " << name << "\n";
    if (error) {
      msg << error->localizedDescription()->utf8String() << "\n";
    }
    throw std::runtime_error(msg.str());
  }

  // Add kernel to cache
  kernel_map_.insert({name, kernel});
  return kernel;
}

Device& device(mlx::core::Device) {
  return metal_device_;
}

NS::AutoreleasePool*& thread_autorelease_pool() {
  static thread_local NS::AutoreleasePool* p =
      NS::AutoreleasePool::alloc()->init();
  return p;
}

void new_stream(Stream stream) {
  thread_autorelease_pool();
  if (stream.device == mlx::core::Device::gpu) {
    device(stream.device).new_queue(stream.index);
  }
}

} // namespace mlx::core::metal

10
mlx/backend/metal/fft.cpp
Normal file
10
mlx/backend/metal/fft.cpp
Normal file
@ -0,0 +1,10 @@
#include "mlx/primitives.h"

namespace mlx::core {

void FFT::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];
  throw std::runtime_error("[FFT] NYI for Metal backend.");
}

} // namespace mlx::core

83
mlx/backend/metal/kernels/CMakeLists.txt
Normal file
83
mlx/backend/metal/kernels/CMakeLists.txt
Normal file
@ -0,0 +1,83 @@
set(
  HEADERS
  ${CMAKE_CURRENT_SOURCE_DIR}/bf16.h
  ${CMAKE_CURRENT_SOURCE_DIR}/bf16_math.h
  ${CMAKE_CURRENT_SOURCE_DIR}/complex.h
  ${CMAKE_CURRENT_SOURCE_DIR}/defines.h
  ${CMAKE_CURRENT_SOURCE_DIR}/erf.h
  ${CMAKE_CURRENT_SOURCE_DIR}/reduce.h
  ${CMAKE_CURRENT_SOURCE_DIR}/utils.h
)

set(
  KERNELS
  "arange"
  "arg_reduce"
  "binary"
  "conv"
  "copy"
  "gemm"
  "gemv"
  "random"
  "reduce"
  "scan"
  "softmax"
  "sort"
  "unary"
  "indexing"
)

function(build_kernel KERNEL)
  set(SRCFILE ${CMAKE_CURRENT_SOURCE_DIR}/${KERNEL}.metal)
  set(HEADERS_PADDED ${HEADERS})
  if(${KERNEL} STREQUAL "gemm")
    set(HEADERS_PADDED ${HEADERS_PADDED} ${CMAKE_CURRENT_SOURCE_DIR}/gemm/gemm.h)
  endif()
  if(${KERNEL} STREQUAL "conv")
    set(HEADERS_PADDED ${HEADERS_PADDED} ${CMAKE_CURRENT_SOURCE_DIR}/gemm/conv.h)
  endif()
  add_custom_command(
    COMMAND xcrun -sdk macosx metal -Wall -Wextra
                  -fno-fast-math
                  -c ${SRCFILE}
                  -I${PROJECT_SOURCE_DIR}
                  -o ${KERNEL}.air
    DEPENDS ${SRCFILE} ${HEADERS_PADDED}
    OUTPUT ${KERNEL}.air
    COMMENT "Building ${KERNEL}.air"
    VERBATIM
  )
endfunction(build_kernel)

foreach(KERNEL ${KERNELS})
  build_kernel(${KERNEL})
  set(KERNEL_AIR ${KERNEL}.air ${KERNEL_AIR})
endforeach()

add_custom_command(
  OUTPUT ${MLX_METAL_PATH}/mlx.metallib
  COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o ${MLX_METAL_PATH}/mlx.metallib
  DEPENDS ${KERNEL_AIR}
  COMMENT "Building mlx.metallib"
  VERBATIM
)

add_custom_target(
  mlx-metallib
  DEPENDS
  ${MLX_METAL_PATH}/mlx.metallib
)

add_dependencies(
  mlx
  mlx-metallib
)

# Install metallib
include(GNUInstallDirs)

install(
  FILES ${MLX_METAL_PATH}/mlx.metallib
  DESTINATION ${CMAKE_INSTALL_LIBDIR}
  COMPONENT metallib
)

17
mlx/backend/metal/kernels/conv_params.h
Normal file
17
mlx/backend/metal/kernels/conv_params.h
Normal file
@ -0,0 +1,17 @@
#pragma once

template <int NDIM>
struct MLXConvParams {
  const int N; // Batch size
  const int C; // In channels
  const int O; // Out channels
  const int iS[NDIM]; // Input spatial dim
  const int wS[NDIM]; // Weight spatial dim
  const int oS[NDIM]; // Output spatial dim
  const int str[NDIM]; // Kernel strides
  const int pad[NDIM]; // Input padding
  const int dil[NDIM]; // Kernel dilation
  const size_t in_strides[NDIM + 2]; // In strides
  const size_t wt_strides[NDIM + 2]; // Wt strides
  const size_t out_strides[NDIM + 2]; // Out strides
};

253
mlx/backend/metal/kernels/indexing.metal
Normal file
253
mlx/backend/metal/kernels/indexing.metal
Normal file
@ -0,0 +1,253 @@
#include <metal_atomic>
#include <metal_texture>

#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/reduce.h"
#include "mlx/backend/metal/kernels/utils.h"

using namespace metal;

/////////////////////////////////////////////////////////////////////
// Gather kernel
/////////////////////////////////////////////////////////////////////

template <typename IdxT, int NIDX>
struct Indices {
  const array<device IdxT*, NIDX> buffers [[id(0)]];
  device int* shapes [[id(NIDX + 1)]];
  device size_t* strides [[id(NIDX + 2)]];
  const int ndim [[id(NIDX + 3)]];
};

template <typename IdxT>
inline size_t offset_neg_idx(IdxT idx, size_t size) {
  return (idx < 0) ? idx + size : idx;
}

template <>
inline size_t offset_neg_idx(bool idx, size_t) {
  return idx;
}

template <>
inline size_t offset_neg_idx(uint32_t idx, size_t) {
  return idx;
}

template <typename T, typename IdxT, int NIDX>
[[kernel]] void gather(
    const device T *src [[buffer(0)]],
    const device Indices<IdxT, NIDX>& indices [[buffer(1)]],
    device T *out [[buffer(2)]],
    const device int *src_shape [[buffer(3)]],
    const device size_t *src_strides [[buffer(4)]],
    const device size_t& src_ndim [[buffer(5)]],
    const device int *slice_sizes [[buffer(6)]],
    const device size_t& slice_size [[buffer(7)]],
    const device int *axes [[buffer(8)]],
    uint gid [[thread_position_in_grid]]) {

  auto ind_idx = gid / slice_size;
  auto ind_offset = gid % slice_size;

  size_t src_idx = 0;
  for (int i = 0; i < NIDX; ++i) {
    auto idx_loc = elem_to_loc(
        ind_idx,
        &indices.shapes[indices.ndim * i],
        &indices.strides[indices.ndim * i],
        indices.ndim);
    auto ax = axes[i];
    auto idx_val = offset_neg_idx(
        indices.buffers[i][idx_loc], src_shape[ax]);
    src_idx += idx_val * src_strides[ax];
  }

  auto src_offset = elem_to_loc(
      ind_offset, slice_sizes, src_strides, src_ndim);
  out[gid] = src[src_idx + src_offset];
}

#define instantiate_gather4(name, src_type, ind_type, nindex) \
template [[host_name("gather" name "_" #nindex)]] \
[[kernel]] void gather( \
    const device src_type *src [[buffer(0)]], \
    const device Indices<ind_type, nindex>& indices [[buffer(1)]], \
    device src_type *out [[buffer(2)]], \
    const device int *src_shape [[buffer(3)]], \
    const device size_t *src_strides [[buffer(4)]], \
    const device size_t& src_ndim [[buffer(5)]], \
    const device int *slice_sizes [[buffer(6)]], \
    const device size_t& slice_size [[buffer(7)]], \
    const device int* axes [[buffer(8)]], \
    uint gid [[thread_position_in_grid]]);

// Special case for NIDX=0
instantiate_gather4("bool_", bool, bool, 0)
instantiate_gather4("uint8", uint8_t, bool, 0)
instantiate_gather4("uint16", uint16_t, bool, 0)
instantiate_gather4("uint32", uint32_t, bool, 0)
instantiate_gather4("uint64", uint64_t, bool, 0)
instantiate_gather4("int8", int8_t, bool, 0)
instantiate_gather4("int16", int16_t, bool, 0)
instantiate_gather4("int32", int32_t, bool, 0)
instantiate_gather4("int64", int64_t, bool, 0)
instantiate_gather4("float16", half, bool, 0)
instantiate_gather4("float32", float, bool, 0)
instantiate_gather4("bfloat16", bfloat16_t, bool, 0)

#define instantiate_gather3(name, src_type, ind_type) \
  instantiate_gather4(name, src_type, ind_type, 1) \
  instantiate_gather4(name, src_type, ind_type, 2) \
  instantiate_gather4(name, src_type, ind_type, 3) \
  instantiate_gather4(name, src_type, ind_type, 4) \
  instantiate_gather4(name, src_type, ind_type, 5) \
  instantiate_gather4(name, src_type, ind_type, 6) \
  instantiate_gather4(name, src_type, ind_type, 7) \
  instantiate_gather4(name, src_type, ind_type, 8) \
  instantiate_gather4(name, src_type, ind_type, 9) \
  instantiate_gather4(name, src_type, ind_type, 10)

#define instantiate_gather(name, src_type) \
  instantiate_gather3(#name "bool_", src_type, bool) \
  instantiate_gather3(#name "uint8", src_type, uint8_t) \
  instantiate_gather3(#name "uint16", src_type, uint16_t) \
  instantiate_gather3(#name "uint32", src_type, uint32_t) \
  instantiate_gather3(#name "uint64", src_type, uint64_t) \
  instantiate_gather3(#name "int8", src_type, int8_t) \
  instantiate_gather3(#name "int16", src_type, int16_t) \
  instantiate_gather3(#name "int32", src_type, int32_t) \
  instantiate_gather3(#name "int64", src_type, int64_t)

instantiate_gather(bool_, bool)
instantiate_gather(uint8, uint8_t)
instantiate_gather(uint16, uint16_t)
instantiate_gather(uint32, uint32_t)
instantiate_gather(uint64, uint64_t)
instantiate_gather(int8, int8_t)
instantiate_gather(int16, int16_t)
instantiate_gather(int32, int32_t)
instantiate_gather(int64, int64_t)
instantiate_gather(float16, half)
instantiate_gather(float32, float)
instantiate_gather(bfloat16, bfloat16_t)

/////////////////////////////////////////////////////////////////////
// Scatter kernel
/////////////////////////////////////////////////////////////////////

template <typename T, typename IdxT, typename Op, int NIDX>
[[kernel]] void scatter(
    const device Indices<IdxT, NIDX>& indices [[buffer(0)]],
    const device T *updates [[buffer(1)]],
    device mlx_atomic<T> *out [[buffer(2)]],
    const device int *upd_shape [[buffer(3)]],
    const device size_t *upd_strides [[buffer(4)]],
    const device size_t& upd_ndim [[buffer(5)]],
    const device size_t& upd_size [[buffer(6)]],
    const device int *out_shape [[buffer(7)]],
    const device size_t *out_strides [[buffer(8)]],
    const device size_t& out_ndim [[buffer(9)]],
    const device int* axes [[buffer(10)]],
    uint gid [[thread_position_in_grid]]) {

  Op op;
  auto ind_idx = gid / upd_size;
  auto ind_offset = gid % upd_size;

  size_t out_idx = 0;
  for (int i = 0; i < NIDX; ++i) {
    auto idx_loc = elem_to_loc(
        ind_idx,
        &indices.shapes[indices.ndim * i],
        &indices.strides[indices.ndim * i],
        indices.ndim);
    auto ax = axes[i];
    auto idx_val = offset_neg_idx(
        indices.buffers[i][idx_loc], out_shape[ax]);
    out_idx += idx_val * out_strides[ax];
  }

  auto out_offset = elem_to_loc(
      ind_offset, upd_shape + indices.ndim, out_strides, out_ndim);
  auto upd_idx = elem_to_loc(gid, upd_shape, upd_strides, upd_ndim);

  op.atomic_update(out + out_idx + out_offset, updates[upd_idx]);
}

#define instantiate_scatter4(name, type, ind_type, op_type, nindex) \
template [[host_name("scatter" name "_" #nindex)]] \
[[kernel]] void scatter<type, ind_type, op_type, nindex>( \
    const device Indices<ind_type, nindex>& indices [[buffer(0)]], \
    const device type *updates [[buffer(1)]], \
    device mlx_atomic<type> *out [[buffer(2)]], \
    const device int *upd_shape [[buffer(3)]], \
    const device size_t *upd_strides [[buffer(4)]], \
    const device size_t& upd_ndim [[buffer(5)]], \
    const device size_t& upd_size [[buffer(6)]], \
    const device int *out_shape [[buffer(7)]], \
    const device size_t *out_strides [[buffer(8)]], \
    const device size_t& out_ndim [[buffer(9)]], \
    const device int* axes [[buffer(10)]], \
    uint gid [[thread_position_in_grid]]);

// Special case NINDEX=0
#define instantiate_scatter_nd0(name, type) \
  instantiate_scatter4(#name "none", type, bool, None, 0) \
  instantiate_scatter4(#name "_sum", type, bool, Sum<type>, 0) \
  instantiate_scatter4(#name "_prod", type, bool, Prod<type>, 0) \
  instantiate_scatter4(#name "_max", type, bool, Max<type>, 0) \
  instantiate_scatter4(#name "_min", type, bool, Min<type>, 0)

#define instantiate_scatter3(name, type, ind_type, op_type) \
  instantiate_scatter4(name, type, ind_type, op_type, 1) \
  instantiate_scatter4(name, type, ind_type, op_type, 2) \
  instantiate_scatter4(name, type, ind_type, op_type, 3) \
  instantiate_scatter4(name, type, ind_type, op_type, 4) \
  instantiate_scatter4(name, type, ind_type, op_type, 5) \
  instantiate_scatter4(name, type, ind_type, op_type, 6) \
  instantiate_scatter4(name, type, ind_type, op_type, 7) \
  instantiate_scatter4(name, type, ind_type, op_type, 8) \
  instantiate_scatter4(name, type, ind_type, op_type, 9) \
  instantiate_scatter4(name, type, ind_type, op_type, 10)

#define instantiate_scatter2(name, type, ind_type) \
  instantiate_scatter3(name "_none", type, ind_type, None) \
  instantiate_scatter3(name "_sum", type, ind_type, Sum<type>) \
  instantiate_scatter3(name "_prod", type, ind_type, Prod<type>) \
  instantiate_scatter3(name "_max", type, ind_type, Max<type>) \
  instantiate_scatter3(name "_min", type, ind_type, Min<type>)

#define instantiate_scatter(name, type) \
  instantiate_scatter2(#name "bool_", type, bool) \
  instantiate_scatter2(#name "uint8", type, uint8_t) \
  instantiate_scatter2(#name "uint16", type, uint16_t) \
  instantiate_scatter2(#name "uint32", type, uint32_t) \
  instantiate_scatter2(#name "uint64", type, uint64_t) \
  instantiate_scatter2(#name "int8", type, int8_t) \
  instantiate_scatter2(#name "int16", type, int16_t) \
  instantiate_scatter2(#name "int32", type, int32_t) \
  instantiate_scatter2(#name "int64", type, int64_t)

// TODO uint64 and int64 unsupported
instantiate_scatter_nd0(bool_, bool)
instantiate_scatter_nd0(uint8, uint8_t)
instantiate_scatter_nd0(uint16, uint16_t)
instantiate_scatter_nd0(uint32, uint32_t)
instantiate_scatter_nd0(int8, int8_t)
instantiate_scatter_nd0(int16, int16_t)
instantiate_scatter_nd0(int32, int32_t)
instantiate_scatter_nd0(float16, half)
instantiate_scatter_nd0(float32, float)
instantiate_scatter_nd0(bfloat16, bfloat16_t)

instantiate_scatter(bool_, bool)
instantiate_scatter(uint8, uint8_t)
instantiate_scatter(uint16, uint16_t)
instantiate_scatter(uint32, uint32_t)
instantiate_scatter(int8, int8_t)
instantiate_scatter(int16, int16_t)
instantiate_scatter(int32, int32_t)
instantiate_scatter(float16, half)
instantiate_scatter(float32, float)
instantiate_scatter(bfloat16, bfloat16_t)

174
mlx/backend/metal/kernels/reduce.h
Normal file
174
mlx/backend/metal/kernels/reduce.h
Normal file
@ -0,0 +1,174 @@
#pragma once

#include <metal_atomic>
#include <metal_simdgroup>

#include "mlx/backend/metal/kernels/atomic.h"
#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/utils.h"

union bool4_or_uint {
  bool4 b;
  unsigned int i;
};

struct None {
  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
    mlx_atomic_store_explicit(out, val, offset);
  }
};

struct And {
  bool simd_reduce(bool val) {
    return simd_all(val);
  }

  static constexpr constant bool init = true;

  void atomic_update(
      device mlx_atomic<unsigned int>* out,
      bool val,
      int elem_idx,
      int offset = 0) {
    if (!val) {
      bool4_or_uint update;
      update.b = {true, true, true, true};
      update.b[elem_idx] = false;
      mlx_atomic_fetch_and_explicit(out, update.i, offset);
    }
  }

  void atomic_update(device mlx_atomic<bool>* out, bool val, int offset = 0) {
    if (!val) {
      mlx_atomic_store_explicit(out, val, offset);
    }
  }

  // Non atomic update
  void update(device bool* out, bool val) {
    *out &= val;
  }

  // Operator
  bool operator()(bool a, bool b) {
    return a && b;
  }
};

struct Or {
  bool simd_reduce(bool val) {
    return simd_any(val);
  }

  static constexpr constant bool init = false;

  void atomic_update(
      device mlx_atomic<unsigned int>* out,
      bool val,
      int elem_idx,
      int offset = 0) {
    if (val) {
      bool4_or_uint update;
      update.b = {false, false, false, false};
      update.b[elem_idx] = true;
      mlx_atomic_fetch_or_explicit(out, update.i, offset);
    }
  }

  void atomic_update(device mlx_atomic<bool>* out, bool val, int offset = 0) {
    if (val) {
      mlx_atomic_store_explicit(out, val, offset);
    }
  }

  // Non atomic update
  void update(device bool* out, bool val) {
    *out |= val;
  }

  // Operator
  bool operator()(bool a, bool b) {
    return a || b;
  }
};

template <typename U>
struct Sum {
  template <typename T>
  T simd_reduce(T val) {
    return simd_sum(val);
  }

  static constexpr constant U init = U(0);

  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
    mlx_atomic_fetch_add_explicit(out, val, offset);
  }

  // Operator
  U operator()(U a, U b) {
    return a + b;
  }
};

template <typename U>
struct Prod {
  template <typename T>
  T simd_reduce(T val) {
    return simd_product(val);
  }

  static constexpr constant U init = U(1);

  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
    mlx_atomic_fetch_mul_explicit(out, val, offset);
  }

  // Operator
  U operator()(U a, U b) {
    return a * b;
  }
};

template <typename U>
struct Min {
  template <typename T>
  T simd_reduce(T val) {
    return simd_min(val);
  }

  static constexpr constant U init = Limits<U>::max;

  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
    mlx_atomic_fetch_min_explicit(out, val, offset);
  }

  // Operator
  U operator()(U a, U b) {
    return a < b ? a : b;
  }
};

template <typename U>
struct Max {
  template <typename T>
  T simd_reduce(T val) {
    return simd_max(val);
  }

  static constexpr constant U init = Limits<U>::min;

  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, int offset = 0) {
    mlx_atomic_fetch_max_explicit(out, val, offset);
  }

  // Operator
  U operator()(U a, U b) {
    return a > b ? a : b;
  }
};

492
mlx/backend/metal/kernels/scan.metal
Normal file
@ -0,0 +1,492 @@
#include <metal_math>
#include <metal_simdgroup>

#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"

using namespace metal;

template <typename U>
struct CumSum {
  static constexpr constant U init = static_cast<U>(0);

  template <typename T>
  U operator()(U a, T b) {
    return a + b;
  }

  U simd_scan(U x) {
    return simd_prefix_inclusive_sum(x);
  }

  U simd_exclusive_scan(U x) {
    return simd_prefix_exclusive_sum(x);
  }
};

template <typename U>
struct CumProd {
  static constexpr constant U init = static_cast<U>(1.0f);

  template <typename T>
  U operator()(U a, T b) {
    return a * b;
  }

  U simd_scan(U x) {
    return simd_prefix_inclusive_product(x);
  }

  U simd_exclusive_scan(U x) {
    return simd_prefix_exclusive_product(x);
  }
};

template <>
struct CumProd<bool> {
  static constexpr constant bool init = true;

  template <typename T>
  bool operator()(bool a, T b) {
    return a & static_cast<bool>(b);
  }

  bool simd_scan(bool x) {
    for (int i = 1; i <= 16; i *= 2) {
      bool other = simd_shuffle_up(x, i);
      x &= other;
    }
    return x;
  }

  bool simd_exclusive_scan(bool x) {
    x = simd_scan(x);
    return simd_shuffle_and_fill_up(x, init, 1);
  }
};
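// The shuffle-up loops above (and in CumMax/CumMin below) implement a
// Hillis-Steele inclusive scan by hand: offsets 1, 2, 4, 8, 16 cover a
// 32-lane simdgroup, for ops that have no built-in simd_prefix_* intrinsic.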

template <typename U>
struct CumMax {
  static constexpr constant U init = Limits<U>::min;

  template <typename T>
  U operator()(U a, T b) {
    return (a >= b) ? a : b;
  }

  U simd_scan(U x) {
    for (int i = 1; i <= 16; i *= 2) {
      U other = simd_shuffle_up(x, i);
      x = (x >= other) ? x : other;
    }
    return x;
  }

  U simd_exclusive_scan(U x) {
    x = simd_scan(x);
    return simd_shuffle_and_fill_up(x, init, 1);
  }
};

template <typename U>
struct CumMin {
  static constexpr constant U init = Limits<U>::max;

  template <typename T>
  U operator()(U a, T b) {
    return (a <= b) ? a : b;
  }

  U simd_scan(U x) {
    for (int i = 1; i <= 16; i *= 2) {
      U other = simd_shuffle_up(x, i);
      x = (x <= other) ? x : other;
    }
    return x;
  }

  U simd_exclusive_scan(U x) {
    x = simd_scan(x);
    return simd_shuffle_and_fill_up(x, init, 1);
  }
};

template <typename T, typename U, int N_READS, bool reverse>
inline void load_unsafe(U values[N_READS], const device T* input) {
  if (reverse) {
    for (int i = 0; i < N_READS; i++) {
      values[N_READS - i - 1] = input[i];
    }
  } else {
    for (int i = 0; i < N_READS; i++) {
      values[i] = input[i];
    }
  }
}

template <typename T, typename U, int N_READS, bool reverse>
inline void load_safe(
    U values[N_READS],
    const device T* input,
    int start,
    int total,
    U init) {
  if (reverse) {
    for (int i = 0; i < N_READS; i++) {
      values[N_READS - i - 1] =
          (start + N_READS - i - 1 < total) ? input[i] : init;
    }
  } else {
    for (int i = 0; i < N_READS; i++) {
      values[i] = (start + i < total) ? input[i] : init;
    }
  }
}
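// Note: callers pass `input` already offset to the block start, so the guard
// on the logical position (start + N_READS - i - 1 in the reverse case)
// decides whether input[i] is read at all; out-of-range lanes get `init`.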

template <typename U, int N_READS, bool reverse>
inline void write_unsafe(U values[N_READS], device U* out) {
  if (reverse) {
    for (int i = 0; i < N_READS; i++) {
      out[i] = values[N_READS - i - 1];
    }
  } else {
    for (int i = 0; i < N_READS; i++) {
      out[i] = values[i];
    }
  }
}

template <typename U, int N_READS, bool reverse>
inline void write_safe(U values[N_READS], device U* out, int start, int total) {
  if (reverse) {
    for (int i = 0; i < N_READS; i++) {
      if (start + N_READS - i - 1 < total) {
        out[i] = values[N_READS - i - 1];
      }
    }
  } else {
    for (int i = 0; i < N_READS; i++) {
      if (start + i < total) {
        out[i] = values[i];
      }
    }
  }
}

template <typename T, typename U, typename Op, int N_READS, bool inclusive, bool reverse>
[[kernel]] void contiguous_scan(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& axis_size [[buffer(2)]],
    uint gid [[thread_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint lsize [[threads_per_threadgroup]],
    uint simd_size [[threads_per_simdgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  Op op;

  // Position the pointers
  in += (gid / lsize) * axis_size;
  out += (gid / lsize) * axis_size;

  // Compute the number of simd_groups
  uint simd_groups = lsize / simd_size;

  // Allocate memory
  U prefix = Op::init;
  U values[N_READS];
  threadgroup U simdgroup_sums[32];

  // Loop over the reduced axis in blocks of size ceildiv(axis_size, N_READS*lsize)
  //   Read block
  //   Compute inclusive scan of the block
  //     Compute inclusive scan per thread
  //     Compute exclusive scan of thread sums in simdgroup
  //     Write simdgroup sums in SM
  //     Compute exclusive scan of simdgroup sums
  //     Compute the output by scanning prefix, prev_simdgroup, prev_thread, value
  //   Write block

  for (uint r = 0; r < ceildiv(axis_size, N_READS * lsize); r++) {
    // Compute the block offset
    uint offset = r * lsize * N_READS + lid * N_READS;

    // Read the values
    if (reverse) {
      if ((offset + N_READS) < axis_size) {
        load_unsafe<T, U, N_READS, reverse>(
            values, in + axis_size - offset - N_READS);
      } else {
        load_safe<T, U, N_READS, reverse>(
            values, in + axis_size - offset - N_READS, offset, axis_size, Op::init);
      }
    } else {
      if ((offset + N_READS) < axis_size) {
        load_unsafe<T, U, N_READS, reverse>(values, in + offset);
      } else {
        load_safe<T, U, N_READS, reverse>(
            values, in + offset, offset, axis_size, Op::init);
      }
    }

    // Compute an inclusive scan per thread
    for (int i = 1; i < N_READS; i++) {
      values[i] = op(values[i], values[i - 1]);
    }

    // Compute exclusive scan of thread sums
    U prev_thread = op.simd_exclusive_scan(values[N_READS - 1]);

    // Write simdgroup_sums to SM
    if (simd_lane_id == simd_size - 1) {
      simdgroup_sums[simd_group_id] = op(prev_thread, values[N_READS - 1]);
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Compute exclusive scan of simdgroup_sums
    if (simd_group_id == 0) {
      U prev_simdgroup = op.simd_exclusive_scan(simdgroup_sums[simd_lane_id]);
      simdgroup_sums[simd_lane_id] = prev_simdgroup;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Compute the output
    for (int i = 0; i < N_READS; i++) {
      values[i] = op(values[i], prefix);
      values[i] = op(values[i], simdgroup_sums[simd_group_id]);
      values[i] = op(values[i], prev_thread);
    }

    // Write the values
    if (reverse) {
      if (inclusive) {
        if ((offset + N_READS) < axis_size) {
          write_unsafe<U, N_READS, reverse>(
              values, out + axis_size - offset - N_READS);
        } else {
          write_safe<U, N_READS, reverse>(
              values, out + axis_size - offset - N_READS, offset, axis_size);
        }
      } else {
        if (lid == 0 && offset == 0) {
          out[axis_size - 1] = Op::init;
        }
        if ((offset + N_READS + 1) < axis_size) {
          write_unsafe<U, N_READS, reverse>(
              values, out + axis_size - offset - 1 - N_READS);
        } else {
          write_safe<U, N_READS, reverse>(
              values, out + axis_size - offset - 1 - N_READS, offset + 1, axis_size);
        }
      }
    } else {
      if (inclusive) {
        if ((offset + N_READS) < axis_size) {
          write_unsafe<U, N_READS, reverse>(values, out + offset);
        } else {
          write_safe<U, N_READS, reverse>(values, out + offset, offset, axis_size);
        }
      } else {
        if (lid == 0 && offset == 0) {
          out[0] = Op::init;
        }
        if ((offset + N_READS + 1) < axis_size) {
          write_unsafe<U, N_READS, reverse>(values, out + offset + 1);
        } else {
          write_safe<U, N_READS, reverse>(
              values, out + offset + 1, offset + 1, axis_size);
        }
      }
    }

    // Share the prefix
    if (simd_group_id == simd_groups - 1 && simd_lane_id == simd_size - 1) {
      simdgroup_sums[0] = values[N_READS - 1];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    prefix = simdgroup_sums[0];
  }
}
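// Editorial note: this kernel assumes the threadgroup size is a multiple of
// the simdgroup size and uses at most 32 simdgroups (simdgroup_sums[32]),
// i.e. up to 1024 threads per threadgroup with 32-wide simdgroups.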

template <typename T, typename U, typename Op, int N_READS, bool inclusive, bool reverse>
[[kernel]] void strided_scan(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& axis_size [[buffer(2)]],
    const constant size_t& stride [[buffer(3)]],
    uint2 gid [[threadgroup_position_in_grid]],
    uint2 lid [[thread_position_in_threadgroup]],
    uint2 lsize [[threads_per_threadgroup]],
    uint simd_size [[threads_per_simdgroup]]) {
  Op op;

  // Allocate memory
  threadgroup U read_buffer[N_READS * 32 * 32 + N_READS * 32];
  U values[N_READS];
  U prefix[N_READS];
  for (int i = 0; i < N_READS; i++) {
    prefix[i] = Op::init;
  }

  // Compute offsets
  int offset = gid.y * axis_size * stride;
  int global_index_x = gid.x * lsize.y * N_READS;

  for (uint j = 0; j < axis_size; j += simd_size) {
    // Calculate the indices for the current thread
    uint index_y = j + lid.y;
    uint check_index_y = index_y;
    uint index_x = global_index_x + lid.x * N_READS;
    if (reverse) {
      index_y = axis_size - 1 - index_y;
    }

    // Read in SM
    if (check_index_y < axis_size && (index_x + N_READS) < stride) {
      for (int i = 0; i < N_READS; i++) {
        read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i] =
            in[offset + index_y * stride + index_x + i];
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if (check_index_y < axis_size && (index_x + i) < stride) {
          read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i] =
              in[offset + index_y * stride + index_x + i];
        } else {
          read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i] =
              Op::init;
        }
      }
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Read strided into registers
    for (int i = 0; i < N_READS; i++) {
      values[i] = read_buffer[lid.x * simd_size * N_READS + lid.y * N_READS + i];
    }
    // Do we need the following barrier? Shouldn't all simd threads execute simultaneously?
    simdgroup_barrier(mem_flags::mem_threadgroup);

    // Perform the scan
    for (int i = 0; i < N_READS; i++) {
      values[i] = op.simd_scan(values[i]);
      values[i] = op(values[i], prefix[i]);
      prefix[i] = simd_shuffle(values[i], simd_size - 1);
    }

    // Write to SM
    for (int i = 0; i < N_READS; i++) {
      read_buffer[lid.x * simd_size * N_READS + lid.y * N_READS + i] = values[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Write to device memory
    if (!inclusive) {
      if (check_index_y == 0) {
        if ((index_x + N_READS) < stride) {
          for (int i = 0; i < N_READS; i++) {
            out[offset + index_y * stride + index_x + i] = Op::init;
          }
        } else {
          for (int i = 0; i < N_READS; i++) {
            if ((index_x + i) < stride) {
              out[offset + index_y * stride + index_x + i] = Op::init;
            }
          }
        }
      }
      if (reverse) {
        index_y -= 1;
        check_index_y += 1;
      } else {
        index_y += 1;
        check_index_y += 1;
      }
    }
    if (check_index_y < axis_size && (index_x + N_READS) < stride) {
      for (int i = 0; i < N_READS; i++) {
        out[offset + index_y * stride + index_x + i] =
            read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i];
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if (check_index_y < axis_size && (index_x + i) < stride) {
          out[offset + index_y * stride + index_x + i] =
              read_buffer[lid.y * simd_size * N_READS + lid.x * N_READS + i];
        }
      }
    }
  }
}
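// Editorial note: read_buffer holds a 32x32 tile of N_READS-wide vectors; the
// extra N_READS * 32 entries are presumably padding to reduce shared-memory
// bank conflicts when the tile is read back transposed.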

#define instantiate_contiguous_scan(name, itype, otype, op, inclusive, reverse, nreads) \
  template [[host_name("contiguous_scan_" #name)]] \
  [[kernel]] void contiguous_scan<itype, otype, op<otype>, nreads, inclusive, reverse>( \
      const device itype* in [[buffer(0)]], \
      device otype* out [[buffer(1)]], \
      const constant size_t& axis_size [[buffer(2)]], \
      uint gid [[thread_position_in_grid]], \
      uint lid [[thread_position_in_threadgroup]], \
      uint lsize [[threads_per_threadgroup]], \
      uint simd_size [[threads_per_simdgroup]], \
      uint simd_lane_id [[thread_index_in_simdgroup]], \
      uint simd_group_id [[simdgroup_index_in_threadgroup]]);

#define instantiate_strided_scan(name, itype, otype, op, inclusive, reverse, nreads) \
  template [[host_name("strided_scan_" #name)]] \
  [[kernel]] void strided_scan<itype, otype, op<otype>, nreads, inclusive, reverse>( \
      const device itype* in [[buffer(0)]], \
      device otype* out [[buffer(1)]], \
      const constant size_t& axis_size [[buffer(2)]], \
      const constant size_t& stride [[buffer(3)]], \
      uint2 gid [[thread_position_in_grid]], \
      uint2 lid [[thread_position_in_threadgroup]], \
      uint2 lsize [[threads_per_threadgroup]], \
      uint simd_size [[threads_per_simdgroup]]);

#define instantiate_scan_helper(name, itype, otype, op, nreads) \
  instantiate_contiguous_scan(inclusive_##name, itype, otype, op, true, false, nreads) \
  instantiate_contiguous_scan(exclusive_##name, itype, otype, op, false, false, nreads) \
  instantiate_contiguous_scan(reverse_inclusive_##name, itype, otype, op, true, true, nreads) \
  instantiate_contiguous_scan(reverse_exclusive_##name, itype, otype, op, false, true, nreads) \
  instantiate_strided_scan(inclusive_##name, itype, otype, op, true, false, nreads) \
  instantiate_strided_scan(exclusive_##name, itype, otype, op, false, false, nreads) \
  instantiate_strided_scan(reverse_inclusive_##name, itype, otype, op, true, true, nreads) \
  instantiate_strided_scan(reverse_exclusive_##name, itype, otype, op, false, true, nreads)

instantiate_scan_helper(sum_bool__int32, bool, int32_t, CumSum, 4)
instantiate_scan_helper(sum_uint8_uint8, uint8_t, uint8_t, CumSum, 4)
instantiate_scan_helper(sum_uint16_uint16, uint16_t, uint16_t, CumSum, 4)
instantiate_scan_helper(sum_uint32_uint32, uint32_t, uint32_t, CumSum, 4)
//instantiate_scan_helper(sum_uint64_uint64, uint64_t, uint64_t, CumSum, 2)
instantiate_scan_helper(sum_int8_int8, int8_t, int8_t, CumSum, 4)
instantiate_scan_helper(sum_int16_int16, int16_t, int16_t, CumSum, 4)
instantiate_scan_helper(sum_int32_int32, int32_t, int32_t, CumSum, 4)
//instantiate_scan_helper(sum_int64_int64, int64_t, int64_t, CumSum, 2)
instantiate_scan_helper(sum_float16_float16, half, half, CumSum, 4)
instantiate_scan_helper(sum_float32_float32, float, float, CumSum, 4)
//instantiate_scan_helper(sum_bfloat16_bfloat16, bfloat16_t, bfloat16_t, CumSum, 4)
//instantiate_scan_helper(sum_complex64_complex64, complex64_t, complex64_t, CumSum)
//instantiate_scan_helper(prod_bool__bool_, bool, bool, CumProd, 4)
instantiate_scan_helper(prod_uint8_uint8, uint8_t, uint8_t, CumProd, 4)
instantiate_scan_helper(prod_uint16_uint16, uint16_t, uint16_t, CumProd, 4)
instantiate_scan_helper(prod_uint32_uint32, uint32_t, uint32_t, CumProd, 4)
//instantiate_scan_helper(prod_uint64_uint64, uint64_t, uint64_t, CumProd, 2)
instantiate_scan_helper(prod_int8_int8, int8_t, int8_t, CumProd, 4)
instantiate_scan_helper(prod_int16_int16, int16_t, int16_t, CumProd, 4)
instantiate_scan_helper(prod_int32_int32, int32_t, int32_t, CumProd, 4)
//instantiate_scan_helper(prod_int64_int64, int64_t, int64_t, CumProd, 2)
instantiate_scan_helper(prod_float16_float16, half, half, CumProd, 4)
instantiate_scan_helper(prod_float32_float32, float, float, CumProd, 4)
//instantiate_scan_helper(prod_bfloat16_bfloat16, bfloat16_t, bfloat16_t, CumProd, 4)
//instantiate_scan_helper(prod_complex64_complex64, complex64_t, complex64_t, CumProd)
//instantiate_scan_helper(max_bool__bool_, bool, bool, CumMax, 4)
instantiate_scan_helper(max_uint8_uint8, uint8_t, uint8_t, CumMax, 4)
instantiate_scan_helper(max_uint16_uint16, uint16_t, uint16_t, CumMax, 4)
instantiate_scan_helper(max_uint32_uint32, uint32_t, uint32_t, CumMax, 4)
//instantiate_scan_helper(max_uint64_uint64, uint64_t, uint64_t, CumMax, 2)
instantiate_scan_helper(max_int8_int8, int8_t, int8_t, CumMax, 4)
instantiate_scan_helper(max_int16_int16, int16_t, int16_t, CumMax, 4)
instantiate_scan_helper(max_int32_int32, int32_t, int32_t, CumMax, 4)
//instantiate_scan_helper(max_int64_int64, int64_t, int64_t, CumMax, 2)
instantiate_scan_helper(max_float16_float16, half, half, CumMax, 4)
instantiate_scan_helper(max_float32_float32, float, float, CumMax, 4)
//instantiate_scan_helper(max_bfloat16_bfloat16, bfloat16_t, bfloat16_t, CumMax, 4)
//instantiate_scan_helper(max_complex64_complex64, complex64_t, complex64_t, CumMax)
//instantiate_scan_helper(min_bool__bool_, bool, bool, CumMin, 4)
instantiate_scan_helper(min_uint8_uint8, uint8_t, uint8_t, CumMin, 4)
instantiate_scan_helper(min_uint16_uint16, uint16_t, uint16_t, CumMin, 4)
instantiate_scan_helper(min_uint32_uint32, uint32_t, uint32_t, CumMin, 4)
//instantiate_scan_helper(min_uint64_uint64, uint64_t, uint64_t, CumMin, 2)
instantiate_scan_helper(min_int8_int8, int8_t, int8_t, CumMin, 4)
instantiate_scan_helper(min_int16_int16, int16_t, int16_t, CumMin, 4)
instantiate_scan_helper(min_int32_int32, int32_t, int32_t, CumMin, 4)
//instantiate_scan_helper(min_int64_int64, int64_t, int64_t, CumMin, 2)
instantiate_scan_helper(min_float16_float16, half, half, CumMin, 4)
instantiate_scan_helper(min_float32_float32, float, float, CumMin, 4)
//instantiate_scan_helper(min_bfloat16_bfloat16, bfloat16_t, bfloat16_t, CumMin, 4)
//instantiate_scan_helper(min_complex64_complex64, complex64_t, complex64_t, CumMin)
88
mlx/backend/metal/metal.cpp
Normal file
@ -0,0 +1,88 @@
#include <cstdlib>
#include <future>
#include <memory>

#include "mlx/array.h"
#include "mlx/backend/metal/device.h"
#include "mlx/primitives.h"
#include "mlx/scheduler.h"

namespace mlx::core::metal {

int max_ops_per_buffer() {
  auto get_val = []() {
    if (const char* buff_str = std::getenv("MLX_MAX_OPS_PER_BUFFER")) {
      return atoi(buff_str);
    } else {
      return 10;
    }
  };
  static int max_ops_per_buffer_ = get_val();
  return max_ops_per_buffer_;
}

#define MAX_OPS_PER_BUFFER max_ops_per_buffer()
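// The MLX_MAX_OPS_PER_BUFFER environment variable tunes how many ops are
// batched into one Metal command buffer before it is committed (default 10),
// e.g. `MLX_MAX_OPS_PER_BUFFER=20 python script.py`.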

MTL::CommandBuffer* increment_command_buffer(Stream s) {
  auto& d = metal::device(s.device);
  auto command_buffer = d.get_command_buffer(s.index);
  if (command_buffer == nullptr ||
      d.get_command_buffer_ops(s.index) >= MAX_OPS_PER_BUFFER) {
    if (command_buffer != nullptr) {
      d.end_encoding(s.index);
      scheduler::notify_new_task(s);
      command_buffer->addCompletedHandler(
          [s](MTL::CommandBuffer*) { scheduler::notify_task_completion(s); });
      d.commit_command_buffer(s.index);
    }
    command_buffer = d.new_command_buffer(s.index);
  }
  d.increment_command_buffer_ops(s.index);
  return command_buffer;
}

std::function<void()> make_task(
    array& arr,
    std::vector<std::shared_future<void>> deps,
    std::shared_ptr<std::promise<void>> p,
    bool retain_graph) {
  auto task =
      [retain_graph, arr, deps = std::move(deps), p = std::move(p)]() mutable {
        for (auto& d : deps) {
          d.wait();
        }
        auto s = arr.primitive().stream();
        auto command_buffer = increment_command_buffer(s);
        arr.primitive().eval_gpu(arr.inputs(), arr);
        if (p) {
          metal::device(s.device).end_encoding(s.index);
          scheduler::notify_new_task(s);
          command_buffer->addCompletedHandler(
              [retain_graph, s, arr, p = std::move(p)](
                  MTL::CommandBuffer*) mutable {
                if (!retain_graph) {
                  arr.detach();
                }
                p->set_value();
                // Signal this thread to clear the pool on a synchronization.
                scheduler::enqueue(s, []() {
                  thread_autorelease_pool()->release();
                  thread_autorelease_pool() =
                      NS::AutoreleasePool::alloc()->init();
                });
                scheduler::notify_task_completion(s);
              });
          metal::device(s.device).commit_command_buffer(s.index);
        } else {
          command_buffer->addCompletedHandler(
              [retain_graph, s, arr](MTL::CommandBuffer*) mutable {
                if (!retain_graph) {
                  arr.detach();
                }
              });
        }
      };
  return task;
}
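// Each task produced by make_task waits on its dependencies, encodes the
// array's GPU work into the stream's command buffer, and registers a
// completion handler that detaches the array (unless the graph is retained)
// and fulfills the promise used for synchronization points.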

} // namespace mlx::core::metal
28
mlx/backend/metal/metal.h
Normal file
@ -0,0 +1,28 @@
#pragma once

#include <future>
#include <memory>
#include <vector>

#include "mlx/array.h"
#include "mlx/stream.h"

namespace mlx::core::metal {

constexpr bool is_available() {
#ifdef _METAL_
  return true;
#else
  return false;
#endif
}
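// _METAL_ is presumably defined by the build system when the Metal backend is
// compiled in, so availability resolves at compile time.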

void new_stream(Stream stream);

std::function<void()> make_task(
    array& arr,
    std::vector<std::shared_future<void>> deps,
    std::shared_ptr<std::promise<void>> p,
    bool retain_graph);

} // namespace mlx::core::metal
82
mlx/backend/metal/softmax.cpp
Normal file
@ -0,0 +1,82 @@
#include <algorithm>

#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  if (!is_floating_point(out.dtype())) {
    throw std::runtime_error(
        "[softmax] Does not support non-floating point types.");
  }
  auto& s = stream();
  auto& d = metal::device(s.device);

  // Make sure that the last dimension is contiguous
  std::vector<array> copies;
  auto check_input = [&copies, &s](const array& x) {
    if (x.strides()[x.ndim() - 1] == 1) {
      return x;
    } else {
      array x_copy(x.shape(), x.dtype(), nullptr, {});
      copy_gpu(x, x_copy, CopyType::General, s);
      copies.push_back(x_copy);
      return x_copy;
    }
  };
  const array& in = check_input(inputs[0]);
  out.set_data(
      allocator::malloc_or_wait(in.data_size() * in.itemsize()),
      in.data_size(),
      in.strides(),
      in.flags());

  int axis_size = in.shape().back();
  int n_rows = in.data_size() / axis_size;

  const int simd_size = 32;
  const int n_reads = SOFTMAX_N_READS;
  const int looped_limit = SOFTMAX_LOOPED_LIMIT;
  std::string op_name = "softmax_";
  if (axis_size > looped_limit) {
    op_name += "looped_";
  }
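  // Rows up to SOFTMAX_LOOPED_LIMIT are handled with one thread per n_reads
  // elements; longer rows fall back to a "looped" kernel variant that
  // presumably iterates over the row with a full threadgroup.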
  op_name += type_to_name(out);
  auto compute_encoder = d.get_command_encoder(s.index);
  {
    auto kernel = d.get_kernel(op_name);

    MTL::Size grid_dims, group_dims;
    if (axis_size <= looped_limit) {
      size_t threadgroup_needed = (axis_size + n_reads - 1) / n_reads;
      size_t simds_needed = (threadgroup_needed + simd_size - 1) / simd_size;
      size_t threadgroup_size = simd_size * simds_needed;
      assert(threadgroup_size <= kernel->maxTotalThreadsPerThreadgroup());
      size_t n_threads = n_rows * threadgroup_size;
      grid_dims = MTL::Size(n_threads, 1, 1);
      group_dims = MTL::Size(threadgroup_size, 1, 1);
    } else {
      size_t threadgroup_size = kernel->maxTotalThreadsPerThreadgroup();
      size_t n_threads = n_rows * threadgroup_size;
      grid_dims = MTL::Size(n_threads, 1, 1);
      group_dims = MTL::Size(threadgroup_size, 1, 1);
    }

    compute_encoder->setComputePipelineState(kernel);
    set_array_buffer(compute_encoder, in, 0);
    set_array_buffer(compute_encoder, out, 1);
    compute_encoder->setBytes(&axis_size, sizeof(int), 2);
    compute_encoder->setThreadgroupMemoryLength(simd_size * in.itemsize(), 0);
    compute_encoder->setThreadgroupMemoryLength(simd_size * in.itemsize(), 1);
    compute_encoder->dispatchThreads(grid_dims, group_dims);
  }
  d.get_command_buffer(s.index)->addCompletedHandler(
      [copies](MTL::CommandBuffer*) mutable { copies.clear(); });
}

} // namespace mlx::core
336
mlx/backend/metal/sort.cpp
Normal file
@ -0,0 +1,336 @@
#include <algorithm>

#include "mlx/backend/metal/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace {

template <bool ARGSORT>
void single_block_sort(
    const Stream& s,
    metal::Device& d,
    const array& in,
    array& out,
    int axis,
    int bn,
    int tn) {
  // Prepare shapes
  int n_rows = in.size() / in.shape(axis);

  std::vector<size_t> nc_str = in.strides();
  nc_str.erase(nc_str.begin() + axis);

  std::vector<int> nc_shape = in.shape();
  nc_shape.erase(nc_shape.begin() + axis);

  int nc_dim = nc_shape.size();

  int size_sorted_axis = in.shape(axis);
  int stride_sorted_axis = in.strides()[axis];
  int stride_segment_axis = *std::min_element(nc_str.begin(), nc_str.end());

  // Check if remaining strides are contiguous
  bool contiguous_write = true;
  if (axis != in.ndim() - 1 && axis != 0) {
    for (int i = 0; i < nc_str.size() - 1; ++i) {
      size_t expected = nc_str[i + 1] * nc_shape[i + 1];
      contiguous_write &= (nc_str[i] == expected);
    }
  }

  // Prepare kernel name
  std::ostringstream kname;
  if (ARGSORT) {
    kname << "arg_";
  }
  kname << "block_merge_sort_" << type_to_name(in) << "_" << type_to_name(out)
        << "_bn" << bn << "_tn" << tn;

  if (!contiguous_write) {
    kname << "_nc";
  }

  // Prepare command encoder
  auto compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kname.str());
  compute_encoder->setComputePipelineState(kernel);

  // Set inputs
  set_array_buffer(compute_encoder, in, 0);
  set_array_buffer(compute_encoder, out, 1);
  compute_encoder->setBytes(&size_sorted_axis, sizeof(int), 2);
  compute_encoder->setBytes(&stride_sorted_axis, sizeof(int), 3);

  if (contiguous_write) {
    compute_encoder->setBytes(&stride_segment_axis, sizeof(int), 4);
  } else {
    compute_encoder->setBytes(&nc_dim, sizeof(int), 4);
    compute_encoder->setBytes(nc_shape.data(), nc_dim * sizeof(int), 5);
    compute_encoder->setBytes(nc_str.data(), nc_dim * sizeof(size_t), 6);
  }

  MTL::Size group_dims = MTL::Size(bn, 1, 1);
  MTL::Size grid_dims = MTL::Size(1, n_rows, 1);

  compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
}

template <bool ARGSORT>
void multi_block_sort(
    const Stream& s,
    metal::Device& d,
    const array& in,
    array& out,
    int axis,
    int bn,
    int tn,
    int n_blocks) {
  // Prepare shapes
  int n_rows = in.size() / in.shape(axis);

  std::vector<size_t> nc_str = in.strides();
  nc_str.erase(nc_str.begin() + axis);

  std::vector<int> nc_shape = in.shape();
  nc_shape.erase(nc_shape.begin() + axis);

  int nc_dim = nc_shape.size();

  int size_sorted_axis = in.shape(axis);
  int stride_sorted_axis = in.strides()[axis];

  // Make temporary copies
  array dev_vals_0({n_rows, size_sorted_axis}, in.dtype(), nullptr, {});
  array dev_vals_1({n_rows, size_sorted_axis}, in.dtype(), nullptr, {});

  array dev_idxs_0({n_rows, size_sorted_axis}, uint32, nullptr, {});
  array dev_idxs_1({n_rows, size_sorted_axis}, uint32, nullptr, {});

  array block_partitions({n_rows, n_blocks + 1}, uint32, nullptr, {});

  // Do allocations
  dev_vals_0.set_data(allocator::malloc_or_wait(dev_vals_0.nbytes()));
  dev_vals_1.set_data(allocator::malloc_or_wait(dev_vals_1.nbytes()));
  dev_idxs_0.set_data(allocator::malloc_or_wait(dev_idxs_0.nbytes()));
  dev_idxs_1.set_data(allocator::malloc_or_wait(dev_idxs_1.nbytes()));
  block_partitions.set_data(
      allocator::malloc_or_wait(block_partitions.nbytes()));

  std::vector<array> copies = {
      dev_vals_0, dev_vals_1, dev_idxs_0, dev_idxs_1, block_partitions};

  // Prepare command encoder
  auto compute_encoder = d.get_command_encoder(s.index);

  // Do blockwise sort
  {
    std::ostringstream kname;
    kname << "mb_block_sort_" << type_to_name(dev_vals_0) << "_"
          << type_to_name(dev_idxs_0) << "_bn" << bn << "_tn" << tn;

    auto kernel = d.get_kernel(kname.str());
    compute_encoder->setComputePipelineState(kernel);

    set_array_buffer(compute_encoder, in, 0);
    set_array_buffer(compute_encoder, dev_vals_0, 1);
    set_array_buffer(compute_encoder, dev_idxs_0, 2);
    compute_encoder->setBytes(&size_sorted_axis, sizeof(int), 3);
    compute_encoder->setBytes(&stride_sorted_axis, sizeof(int), 4);
    compute_encoder->setBytes(&nc_dim, sizeof(int), 5);
    compute_encoder->setBytes(nc_shape.data(), nc_dim * sizeof(int), 6);
    compute_encoder->setBytes(nc_str.data(), nc_dim * sizeof(size_t), 7);

    MTL::Size group_dims = MTL::Size(bn, 1, 1);
    MTL::Size grid_dims = MTL::Size(n_blocks, n_rows, 1);

    compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
  }

  // Do merges
  bool ping = false;
  array dev_vals_in = dev_vals_0;
  array dev_idxs_in = dev_idxs_0;
  array dev_vals_out = dev_vals_1;
  array dev_idxs_out = dev_idxs_1;
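  // The merge passes below ping-pong between the two value/index buffer
  // pairs: each pass doubles merge_tiles (the number of sorted blocks merged
  // per output tile) and swaps which pair is input and which is output.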
  for (int merge_tiles = 2; merge_tiles <= n_blocks; merge_tiles *= 2) {
    dev_vals_in = ping ? dev_vals_1 : dev_vals_0;
    dev_idxs_in = ping ? dev_idxs_1 : dev_idxs_0;
    dev_vals_out = ping ? dev_vals_0 : dev_vals_1;
    dev_idxs_out = ping ? dev_idxs_0 : dev_idxs_1;
    ping = !ping;

    // Do partition
    {
      std::ostringstream kname;
      kname << "mb_block_partiton_" << type_to_name(dev_vals_in) << "_"
            << type_to_name(dev_idxs_in) << "_bn" << bn << "_tn" << tn;

      auto kernel = d.get_kernel(kname.str());
      compute_encoder->setComputePipelineState(kernel);

      set_array_buffer(compute_encoder, block_partitions, 0);
      set_array_buffer(compute_encoder, dev_vals_in, 1);
      set_array_buffer(compute_encoder, dev_idxs_in, 2);
      compute_encoder->setBytes(&size_sorted_axis, sizeof(int), 3);
      compute_encoder->setBytes(&merge_tiles, sizeof(int), 4);

      MTL::Size group_dims = MTL::Size(n_blocks + 1, 1, 1);
      MTL::Size grid_dims = MTL::Size(1, n_rows, 1);

      compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
    }

    // Do merge
    {
      std::ostringstream kname;
      kname << "mb_block_merge_" << type_to_name(dev_vals_in) << "_"
            << type_to_name(dev_idxs_in) << "_bn" << bn << "_tn" << tn;

      auto kernel = d.get_kernel(kname.str());
      compute_encoder->setComputePipelineState(kernel);

      set_array_buffer(compute_encoder, block_partitions, 0);
      set_array_buffer(compute_encoder, dev_vals_in, 1);
      set_array_buffer(compute_encoder, dev_idxs_in, 2);
      set_array_buffer(compute_encoder, dev_vals_out, 3);
      set_array_buffer(compute_encoder, dev_idxs_out, 4);
      compute_encoder->setBytes(&size_sorted_axis, sizeof(int), 5);
      compute_encoder->setBytes(&merge_tiles, sizeof(int), 6);
      compute_encoder->setBytes(&n_blocks, sizeof(int), 7);

      MTL::Size group_dims = MTL::Size(bn, 1, 1);
      MTL::Size grid_dims = MTL::Size(n_blocks, n_rows, 1);

      compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
    }
  }

  // Copy outputs with appropriate strides
  array strided_out_arr = ARGSORT ? dev_idxs_out : dev_vals_out;

  if (axis == strided_out_arr.ndim() - 1) {
    copy_gpu_inplace(strided_out_arr, out, CopyType::Vector, s);
  } else {
    std::vector<int> strided_out_shape = strided_out_arr.shape();
    std::vector<size_t> strided_out_str = strided_out_arr.strides();

    int out_axis_shape = strided_out_shape[axis];
    int out_axis_str = strided_out_str[axis];

    strided_out_shape.erase(strided_out_shape.begin() + axis);
    strided_out_str.erase(strided_out_str.begin() + axis);

    strided_out_shape.push_back(out_axis_shape);
    strided_out_str.push_back(out_axis_str);

    array strided_out_slice(strided_out_shape, out.dtype(), nullptr, {});
    strided_out_slice.copy_shared_buffer(
        strided_out_arr,
        strided_out_str,
        strided_out_arr.flags(),
        strided_out_arr.size(),
        0);

    copy_gpu_inplace(strided_out_slice, out, CopyType::General, s);
  }

  // Clear copies
  d.get_command_buffer(s.index)->addCompletedHandler(
      [copies](MTL::CommandBuffer*) mutable { copies.clear(); });
}

template <bool ARGSORT>
void gpu_merge_sort(
    const Stream& s,
    metal::Device& d,
    const array& in,
    array& out,
    int axis_) {
  // Get size info
  int axis = axis_ < 0 ? axis_ + in.ndim() : axis_;
  int size_sorted_axis = in.shape(axis);

  // Get kernel size
  int tn = 8;
  int bn = 128;
  int potential_bn = (size_sorted_axis + tn - 1) / tn;

  if (potential_bn > 256) {
    bn = 512;
  } else if (potential_bn > 128) {
    bn = 256;
  } else {
    bn = 128;
  }

  if (bn == 512 && size_of(in.dtype()) > 4) {
    bn = 256;
  }
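  // Each sort block handles bn * tn elements (threads per group x elements
  // per thread); dropping bn back to 256 for dtypes wider than 4 bytes
  // presumably keeps the block's threadgroup memory within device limits.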

  int n_per_block = bn * tn;
  int n_blocks = (size_sorted_axis + n_per_block - 1) / n_per_block;

  if (n_blocks > 1) {
    return multi_block_sort<ARGSORT>(s, d, in, out, axis, bn, tn, n_blocks);
  } else {
    return single_block_sort<ARGSORT>(s, d, in, out, axis, bn, tn);
  }
}

} // namespace

void ArgSort::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  out.set_data(allocator::malloc_or_wait(out.nbytes()));

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& in = inputs[0];

  gpu_merge_sort<true>(s, d, in, out, axis_);
}

void Sort::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  out.set_data(allocator::malloc_or_wait(out.nbytes()));

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& in = inputs[0];

  gpu_merge_sort<false>(s, d, in, out, axis_);
}

void ArgPartition::eval_gpu(const std::vector<array>& inputs, array& out) {
  // We direct arg partition to sort for now
  assert(inputs.size() == 1);

  out.set_data(allocator::malloc_or_wait(out.nbytes()));

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& in = inputs[0];

  gpu_merge_sort<true>(s, d, in, out, axis_);
}

void Partition::eval_gpu(const std::vector<array>& inputs, array& out) {
  // We direct partition to sort for now
  assert(inputs.size() == 1);

  out.set_data(allocator::malloc_or_wait(out.nbytes()));

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& in = inputs[0];

  gpu_merge_sort<false>(s, d, in, out, axis_);
}

} // namespace mlx::core
7
mlx/backend/no_metal/CMakeLists.txt
Normal file
@ -0,0 +1,7 @@
target_sources(
  mlx
  PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/metal.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
)
77
mlx/backend/no_metal/primitives.cpp
Normal file
@ -0,0 +1,77 @@

#include "mlx/primitives.h"

#define NO_GPU(func) \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
    throw std::runtime_error(#func " has no GPU implementation."); \
  }
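// NO_GPU stubs out eval_gpu for every primitive so CPU-only builds still
// link, while any attempt to run a primitive on the GPU fails loudly with a
// runtime error naming that primitive.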

namespace mlx::core {

NO_GPU(Abs)
NO_GPU(Add)
NO_GPU(Arange)
NO_GPU(ArcCos)
NO_GPU(ArcCosh)
NO_GPU(ArcSin)
NO_GPU(ArcSinh)
NO_GPU(ArcTan)
NO_GPU(ArcTanh)
NO_GPU(ArgPartition)
NO_GPU(ArgReduce)
NO_GPU(ArgSort)
NO_GPU(AsType)
NO_GPU(AsStrided)
NO_GPU(Broadcast)
NO_GPU(Concatenate)
NO_GPU(Convolution)
NO_GPU(Copy)
NO_GPU(Cos)
NO_GPU(Cosh)
NO_GPU(Divide)
NO_GPU(Equal)
NO_GPU(Erf)
NO_GPU(ErfInv)
NO_GPU(Exp)
NO_GPU(FFT)
NO_GPU(Full)
NO_GPU(Gather)
NO_GPU(Greater)
NO_GPU(GreaterEqual)
NO_GPU(Less)
NO_GPU(LessEqual)
NO_GPU(Load)
NO_GPU(Log)
NO_GPU(Log1p)
NO_GPU(LogicalNot)
NO_GPU(LogAddExp)
NO_GPU(Matmul)
NO_GPU(Maximum)
NO_GPU(Minimum)
NO_GPU(Multiply)
NO_GPU(Negative)
NO_GPU(NotEqual)
NO_GPU(Pad)
NO_GPU(Partition)
NO_GPU(Power)
NO_GPU(RandomBits)
NO_GPU(Reduce)
NO_GPU(Reshape)
NO_GPU(Scan)
NO_GPU(Scatter)
NO_GPU(Sigmoid)
NO_GPU(Sign)
NO_GPU(Sin)
NO_GPU(Sinh)
NO_GPU(Slice)
NO_GPU(Softmax)
NO_GPU(Sort)
NO_GPU(Square)
NO_GPU(Sqrt)
NO_GPU(StopGradient)
NO_GPU(Subtract)
NO_GPU(Tan)
NO_GPU(Tanh)
NO_GPU(Transpose)

} // namespace mlx::core
29
mlx/device.cpp
Normal file
@ -0,0 +1,29 @@
#include "mlx/device.h"
#include "mlx/backend/metal/metal.h"

namespace mlx::core {

static Device default_device_{
    metal::is_available() ? Device::gpu : Device::cpu};

const Device& default_device() {
  return default_device_;
}

void set_default_device(const Device& d) {
  if (!metal::is_available() && d == Device::gpu) {
    throw std::invalid_argument(
        "[set_default_device] Cannot set gpu device without gpu backend.");
  }
  default_device_ = d;
}

bool operator==(const Device& lhs, const Device& rhs) {
  return lhs.type == rhs.type && lhs.index == rhs.index;
}

bool operator!=(const Device& lhs, const Device& rhs) {
  return !(lhs == rhs);
}

} // namespace mlx::core
99
mlx/dtype.h
Normal file
@ -0,0 +1,99 @@
#pragma once

#include <complex>
#include <cstdint>
#include <ostream>
#include <string>

#include "mlx/types/complex.h"
#include "mlx/types/half_types.h"

namespace mlx::core {

struct Dtype {
  enum class Val {
    bool_,
    uint8,
    uint16,
    uint32,
    uint64,
    int8,
    int16,
    int32,
    int64,
    float16,
    float32,
    bfloat16,
    complex64,
  };

  enum class Kind {
    b, /* bool */
    u, /* unsigned int */
    i, /* signed int */
    f, /* float */
    c, /* complex */
    V, /* void - used for brain float */
  };

  Val val;
  const uint8_t size;
  constexpr explicit Dtype(Val val, uint8_t size) : val(val), size(size){};
  constexpr operator Val() const {
    return val;
  };
};
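// The implicit conversion to Val lets a Dtype be compared and switched on
// directly, e.g. `switch (t) { case Dtype::Val::float32: ... }`.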

inline bool is_available(const Dtype& dtype) {
  return true;
}

static constexpr Dtype bool_{Dtype::Val::bool_, sizeof(bool)};

static constexpr Dtype uint8{Dtype::Val::uint8, sizeof(uint8_t)};
static constexpr Dtype uint16{Dtype::Val::uint16, sizeof(uint16_t)};
static constexpr Dtype uint32{Dtype::Val::uint32, sizeof(uint32_t)};
static constexpr Dtype uint64{Dtype::Val::uint64, sizeof(uint64_t)};

static constexpr Dtype int8{Dtype::Val::int8, sizeof(int8_t)};
static constexpr Dtype int16{Dtype::Val::int16, sizeof(int16_t)};
static constexpr Dtype int32{Dtype::Val::int32, sizeof(int32_t)};
static constexpr Dtype int64{Dtype::Val::int64, sizeof(int64_t)};

static constexpr Dtype float16{Dtype::Val::float16, sizeof(uint16_t)};
static constexpr Dtype float32{Dtype::Val::float32, sizeof(float)};
static constexpr Dtype bfloat16{Dtype::Val::bfloat16, sizeof(uint16_t)};
static constexpr Dtype complex64{Dtype::Val::complex64, sizeof(complex64_t)};

Dtype promote_types(const Dtype& t1, const Dtype& t2);

inline uint8_t size_of(const Dtype& t) {
  return t.size;
}

Dtype::Kind kindof(const Dtype& t);

inline bool is_unsigned(const Dtype& t) {
  return kindof(t) == Dtype::Kind::u || kindof(t) == Dtype::Kind::b;
}

inline bool is_floating_point(const Dtype& t) {
  return kindof(t) == Dtype::Kind::f || kindof(t) == Dtype::Kind::V ||
      kindof(t) == Dtype::Kind::c;
}

inline bool is_integral(const Dtype& t) {
  return !(is_floating_point(t));
}

template <typename T>
struct TypeToDtype {
  operator Dtype();
};

// Array protocol typestring for Dtype
std::string dtype_to_array_protocol(const Dtype& t);
// Dtype from array protocol type string
Dtype dtype_from_array_protocol(const std::string& t);

} // namespace mlx::core
190
mlx/fft.cpp
Normal file
@ -0,0 +1,190 @@
#include <numeric>
#include <set>

#include "mlx/fft.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core::fft {

array fft_impl(
    const array& a,
    std::vector<int> n,
    const std::vector<int>& axes,
    bool real,
    bool inverse,
    StreamOrDevice s) {
  if (a.ndim() < 1) {
    throw std::invalid_argument(
        "[fftn] Requires array with at least one dimension.");
  }
  if (n.size() != axes.size()) {
    throw std::invalid_argument("[fftn] Shape and axes have different sizes.");
  }
  if (axes.empty()) {
    return a;
  }

  std::vector<size_t> valid_axes;
  for (int ax : axes) {
    valid_axes.push_back(ax < 0 ? ax + a.ndim() : ax);
  }
  std::set<int> unique_axes(valid_axes.begin(), valid_axes.end());
  if (unique_axes.size() != axes.size()) {
    std::ostringstream msg;
    msg << "[fftn] Duplicated axis received " << axes;
    throw std::invalid_argument(msg.str());
  }
  if (*unique_axes.begin() < 0 || *unique_axes.rbegin() >= a.ndim()) {
    std::ostringstream msg;
    msg << "[fftn] Invalid axis received for array with " << a.ndim()
        << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  // In the following shape manipulations there are three cases to consider:
  // 1. In a complex to complex transform (fftn / ifftn) the output
  //    and input shapes are the same.
  // 2. In a real to complex transform (rfftn) n specifies the input dims
  //    and the output dims are n[i] / 2 + 1
  // 3. In a complex to real transform (irfftn) n specifies the output dims
  //    and the input dims are n[i] / 2 + 1
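  // For example, rfftn on a real input of length n = 8 along the last axis
  // yields 8 / 2 + 1 = 5 complex outputs, and irfftn inverts that: 5 complex
  // inputs produce 8 real outputs.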

  if (std::any_of(n.begin(), n.end(), [](auto i) { return i <= 0; })) {
    std::ostringstream msg;
    msg << "[fftn] Invalid FFT output size requested " << n;
    throw std::invalid_argument(msg.str());
  }

  std::vector<int> in_shape = a.shape();
  for (int i = 0; i < valid_axes.size(); ++i) {
    in_shape[valid_axes[i]] = n[i];
  }
  if (real && inverse) {
    in_shape[valid_axes.back()] = n.back() / 2 + 1;
  }

  bool any_greater = false;
  bool any_less = false;
  for (int i = 0; i < in_shape.size(); ++i) {
    any_greater |= in_shape[i] > a.shape()[i];
    any_less |= in_shape[i] < a.shape()[i];
  }

  auto in = a;
  if (any_less) {
    in = slice(in, std::vector<int>(in.ndim(), 0), in_shape, s);
  }
  if (any_greater) {
    // Pad with zeros
    auto tmp = zeros(in_shape, a.dtype(), s);
    in = scatter(tmp, std::vector<array>{}, in, std::vector<int>{}, s);
  }

  auto out_shape = in_shape;
  if (real) {
    auto ax = valid_axes.back();
    out_shape[ax] = inverse ? n.back() : out_shape[ax] / 2 + 1;
  }

  auto in_type = real && !inverse ? float32 : complex64;
  auto out_type = real && inverse ? float32 : complex64;
  return array(
      out_shape,
      out_type,
      std::make_unique<FFT>(to_stream(s), valid_axes, inverse, real),
      {astype(in, in_type, s)});
}

array fft_impl(
    const array& a,
    const std::vector<int>& axes,
    bool real,
    bool inverse,
    StreamOrDevice s) {
  std::vector<int> n;
  for (auto ax : axes) {
    n.push_back(a.shape(ax));
  }
  if (real && inverse) {
    n.back() = (n.back() - 1) * 2;
  }
  return fft_impl(a, n, axes, real, inverse, s);
}

array fft_impl(const array& a, bool real, bool inverse, StreamOrDevice s) {
  std::vector<int> axes(a.ndim());
  std::iota(axes.begin(), axes.end(), 0);
  return fft_impl(a, axes, real, inverse, s);
}

array fftn(
    const array& a,
    const std::vector<int>& n,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  return fft_impl(a, n, axes, false, false, s);
}
array fftn(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  return fft_impl(a, axes, false, false, s);
}
array fftn(const array& a, StreamOrDevice s /* = {} */) {
  return fft_impl(a, false, false, s);
}

array ifftn(
    const array& a,
    const std::vector<int>& n,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  return fft_impl(a, n, axes, false, true, s);
}
array ifftn(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  return fft_impl(a, axes, false, true, s);
}
array ifftn(const array& a, StreamOrDevice s /* = {} */) {
  return fft_impl(a, false, true, s);
}

array rfftn(
    const array& a,
    const std::vector<int>& n,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  return fft_impl(a, n, axes, true, false, s);
}
array rfftn(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  return fft_impl(a, axes, true, false, s);
}
array rfftn(const array& a, StreamOrDevice s /* = {} */) {
  return fft_impl(a, true, false, s);
}

array irfftn(
    const array& a,
    const std::vector<int>& n,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  return fft_impl(a, n, axes, true, true, s);
}
array irfftn(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  return fft_impl(a, axes, true, true, s);
}
array irfftn(const array& a, StreamOrDevice s /* = {} */) {
  return fft_impl(a, true, true, s);
}

} // namespace mlx::core::fft
21
mlx/graph_utils.h
Normal file
@ -0,0 +1,21 @@
#pragma once

#include "mlx/array.h"

namespace mlx::core {

void print_graph(std::ostream& os, const std::vector<array>& outputs);

template <typename... Arrays>
void print_graph(std::ostream& os, Arrays... outputs) {
  print_graph(os, std::vector<array>{std::forward<Arrays>(outputs)...});
}

void export_to_dot(std::ostream& os, const std::vector<array>& outputs);

template <typename... Arrays>
void export_to_dot(std::ostream& os, Arrays... outputs) {
  export_to_dot(os, std::vector<array>{std::forward<Arrays>(outputs)...});
}

} // namespace mlx::core
2323
mlx/ops.cpp
Normal file
File diff suppressed because it is too large
16
mlx/transforms_impl.h
Normal file
@ -0,0 +1,16 @@

namespace mlx::core::detail {

std::pair<std::vector<array>, std::vector<array>> vmap_trace(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<array>& inputs,
    const std::vector<int>& in_axes);

std::vector<array> vmap_replace(
    const std::vector<array>& inputs,
    const std::vector<array>& s_inputs,
    const std::vector<array>& s_outputs,
    const std::vector<int>& in_axes,
    const std::vector<int>& out_axes);
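// Judging by the signatures, vmap is implemented in two phases: vmap_trace
// runs `fun` on placeholder inputs to record a traced graph, and vmap_replace
// then rewrites that graph against the real batched inputs and axes.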

} // namespace mlx::core::detail
75
mlx/types/complex.h
Normal file
@ -0,0 +1,75 @@
#pragma once
#include <complex>
#include "mlx/types/half_types.h"

namespace mlx::core {

struct complex64_t;

template <typename T>
static constexpr bool can_convert_to_complex64 =
    !std::is_same_v<T, complex64_t> && std::is_convertible_v<T, float>;

struct complex64_t : public std::complex<float> {
  complex64_t(float v, float u) : std::complex<float>(v, u){};
  complex64_t(std::complex<float> v) : std::complex<float>(v){};

  template <
      typename T,
      typename = typename std::enable_if<can_convert_to_complex64<T>>::type>
  complex64_t(T x) : std::complex<float>(x){};

  operator float() const {
    return real();
  };
};

inline bool operator>=(const complex64_t& a, const complex64_t& b) {
  return (a.real() > b.real()) ||
      (a.real() == b.real() && a.imag() >= b.imag());
}

inline bool operator>(const complex64_t& a, const complex64_t& b) {
  return (a.real() > b.real()) || (a.real() == b.real() && a.imag() > b.imag());
}

inline bool operator<=(const complex64_t& a, const complex64_t& b) {
  return operator>=(b, a);
}

inline bool operator<(const complex64_t& a, const complex64_t& b) {
  return operator>(b, a);
}

inline complex64_t operator-(const complex64_t& v) {
  return -static_cast<std::complex<float>>(v);
}

// clang-format off
#define complex_binop_helper(_op_, _operator_, itype) \
  inline complex64_t _operator_(itype x, const complex64_t& y) { \
    return x _op_ static_cast<std::complex<float>>(y); \
  } \
  inline complex64_t _operator_(const complex64_t& x, itype y) { \
    return static_cast<std::complex<float>>(x) _op_ y; \
  }

#define complex_binop(_op_, _operator_) \
  inline complex64_t _operator_(const complex64_t& x, const complex64_t& y) { \
    return static_cast<std::complex<float>>(x) \
        _op_ static_cast<std::complex<float>>(y); \
  } \
  complex_binop_helper(_op_, _operator_, bool) \
  complex_binop_helper(_op_, _operator_, uint32_t) \
  complex_binop_helper(_op_, _operator_, uint64_t) \
  complex_binop_helper(_op_, _operator_, int32_t) \
  complex_binop_helper(_op_, _operator_, int64_t) \
  complex_binop_helper(_op_, _operator_, float16_t) \
  complex_binop_helper(_op_, _operator_, bfloat16_t) \
  complex_binop_helper(_op_, _operator_, const std::complex<float>&) \
  complex_binop_helper(_op_, _operator_, float)
// clang-format on

complex_binop(+, operator+)
|
||||
|
||||
} // namespace mlx::core
|
232
mlx/types/fp16.h
Normal file
@ -0,0 +1,232 @@
#pragma once

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

#define __MLX_HALF_NAN__ 0x7D00

namespace mlx::core {

namespace {
union float_bits_fp16 {
  float f;
  uint32_t u;
};
} // namespace

struct _MLX_Float16 {
  uint16_t bits_;

  // Default constructor
  _MLX_Float16() = default;

  // Default copy constructor
  _MLX_Float16(_MLX_Float16 const&) = default;

  // Appease std::vector<bool> for being special
  _MLX_Float16& operator=(std::vector<bool>::reference x) {
    bits_ = x;
    return *this;
  }

  _MLX_Float16& operator=(const float& x) {
    return (*this = _MLX_Float16(x));
  }

  // From float32
  _MLX_Float16(const float& x) : bits_(0) {
    // Conversion following
    // https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h

    // Union
    float_bits_fp16 in;

    // Take fp32 bits
    in.f = x;

    // Find and take sign bit
    uint32_t x_sign_32 = in.u & uint32_t(0x80000000);
    uint16_t x_sign_16 = (x_sign_32 >> 16);

    if (std::isnan(x)) {
      bits_ = x_sign_16 | uint16_t(__MLX_HALF_NAN__);
    } else {
      // Union
      float_bits_fp16 inf_scale, zero_scale, magic_bits;

      // Find exponent bits and take the max supported by half
      uint32_t x_expo_32 = in.u & uint32_t(0x7f800000);
      uint32_t max_expo_32 = uint32_t(0x38800000);
      x_expo_32 = x_expo_32 < max_expo_32 ? max_expo_32 : x_expo_32;
      x_expo_32 += uint32_t(15) << 23;

      // Handle scaling to inf as needed
      inf_scale.u = uint32_t(0x77800000);
      zero_scale.u = uint32_t(0x08800000);

      // Combine with magic and let addition do rounding
      magic_bits.u = x_expo_32;
      magic_bits.f += (std::abs(x) * inf_scale.f) * zero_scale.f;

      // Take the lower 5 bits of the exponent
      uint32_t x_expo_16 = ((magic_bits.u >> 13) & uint32_t(0x7c00));

      // Collect the lower 12 bits which have the mantissa
      uint32_t x_mant_16 = magic_bits.u & uint32_t(0x0fff);

      // Combine sign, exp and mantissa
      bits_ = (x_sign_16 | uint16_t(x_expo_16 + x_mant_16));
    }
  }

  // To float32
  operator float() const {
    // Conversion following
    // https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h

    // Union
    float_bits_fp16 out;

    uint32_t x_sign_32 = (bits_ << 16) & uint32_t(0x80000000);
    uint32_t base = (bits_ << 16);
    uint32_t two_base = base + base;

    uint32_t denorm_max = 1u << 27;
    if (two_base < denorm_max) {
      out.u = uint32_t(126) << 23; // magic mask
      out.u |= (two_base >> 17); // Bits from fp16
      out.f -= 0.5f; // magic bias
    } else {
      out.u = uint32_t(0xE0) << 23; // exponent offset
      out.u += (two_base >> 4); // Bits from fp16
      float out_unscaled = out.f; // Store value
      out.u = uint32_t(0x7800000); // exponent scale
      out.f *= out_unscaled;
    }

    // Add sign
    out.u |= x_sign_32;

    return out.f;
  }
};

#define half_binop_base(__op__, __operator__, otype, atype, btype, ctype) \
  inline otype __operator__(atype lhs, btype rhs) {                       \
    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);        \
  }

#define half_binop_helper(__op__, __operator__, otype, itype, ctype) \
  inline otype __operator__(_MLX_Float16 lhs, itype rhs) {           \
    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);   \
  }                                                                  \
  inline otype __operator__(itype lhs, _MLX_Float16 rhs) {           \
    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);   \
  }

// Operators
#define half_binop(__op__, __operator__)                                      \
  half_binop_base(                                                            \
      __op__, __operator__, _MLX_Float16, _MLX_Float16, _MLX_Float16, float); \
  half_binop_helper(__op__, __operator__, float, float, float);               \
  half_binop_helper(__op__, __operator__, double, double, double);            \
  half_binop_helper(__op__, __operator__, _MLX_Float16, bool, float);         \
  half_binop_helper(__op__, __operator__, _MLX_Float16, int32_t, float);      \
  half_binop_helper(__op__, __operator__, _MLX_Float16, uint32_t, float);     \
  half_binop_helper(__op__, __operator__, _MLX_Float16, int64_t, float);      \
  half_binop_helper(__op__, __operator__, _MLX_Float16, uint64_t, float);

half_binop(+, operator+);
half_binop(-, operator-);
half_binop(*, operator*);
half_binop(/, operator/);

#undef half_binop

// Comparison ops
#define half_compop(__op__, __operator__)                             \
  half_binop_base(                                                    \
      __op__, __operator__, bool, _MLX_Float16, _MLX_Float16, float); \
  half_binop_helper(__op__, __operator__, bool, float, float);        \
  half_binop_helper(__op__, __operator__, bool, double, double);      \
  half_binop_helper(__op__, __operator__, bool, int32_t, float);      \
  half_binop_helper(__op__, __operator__, bool, uint32_t, float);     \
  half_binop_helper(__op__, __operator__, bool, int64_t, float);      \
  half_binop_helper(__op__, __operator__, bool, uint64_t, float);

half_compop(>, operator>);
half_compop(<, operator<);
half_compop(>=, operator>=);
half_compop(<=, operator<=);
half_compop(==, operator==);
half_compop(!=, operator!=);

#undef half_compop

// Negative
inline _MLX_Float16 operator-(_MLX_Float16 lhs) {
  return -static_cast<float>(lhs);
}

// Inplace ops
#define half_inplace_op(__op__, __operator__)                              \
  inline _MLX_Float16& __operator__(_MLX_Float16& lhs, const float& rhs) { \
    lhs = lhs __op__ rhs;                                                  \
    return lhs;                                                            \
  }                                                                        \
  inline float& __operator__(float& lhs, _MLX_Float16 rhs) {               \
    lhs = lhs __op__ rhs;                                                  \
    return lhs;                                                            \
  }

half_inplace_op(+, operator+=);
half_inplace_op(-, operator-=);
half_inplace_op(*, operator*=);
half_inplace_op(/, operator/=);

#undef half_inplace_op

// Bitwise ops

#define half_bitop(__op__, __operator__)                                 \
  inline _MLX_Float16 __operator__(_MLX_Float16 lhs, _MLX_Float16 rhs) { \
    _MLX_Float16 out;                                                    \
    out.bits_ = lhs.bits_ __op__ rhs.bits_;                              \
    return out;                                                          \
  }                                                                      \
  inline _MLX_Float16 __operator__(_MLX_Float16 lhs, uint16_t rhs) {     \
    _MLX_Float16 out;                                                    \
    out.bits_ = lhs.bits_ __op__ rhs;                                    \
    return out;                                                          \
  }                                                                      \
  inline _MLX_Float16 __operator__(uint16_t lhs, _MLX_Float16 rhs) {     \
    _MLX_Float16 out;                                                    \
    out.bits_ = lhs __op__ rhs.bits_;                                    \
    return out;                                                          \
  }

half_bitop(|, operator|);
half_bitop(&, operator&);
half_bitop(^, operator^);

#undef half_bitop

#define half_inplace_bitop(__op__, __operator__)                           \
  inline _MLX_Float16& __operator__(_MLX_Float16& lhs, _MLX_Float16 rhs) { \
    lhs.bits_ = lhs.bits_ __op__ rhs.bits_;                                \
    return lhs;                                                            \
  }                                                                        \
  inline _MLX_Float16& __operator__(_MLX_Float16& lhs, uint16_t rhs) {     \
    lhs.bits_ = lhs.bits_ __op__ rhs;                                      \
    return lhs;                                                            \
  }

half_inplace_bitop(|, operator|=);
half_inplace_bitop(&, operator&=);
half_inplace_bitop(^, operator^=);

#undef half_inplace_bitop

} // namespace mlx::core
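To make the magic-number trick above concrete, here is a small Python re-implementation of the float32-to-float16 path. This is a sketch for illustration only: it rounds intermediates to float32 via `struct`, and it ignores overflow to infinity (which the C version handles through the `inf_scale` multiply, but which `struct.pack` would reject):

```python
import math
import struct


def as_f32(v: float) -> float:
    # Round a Python double to the nearest float32, mimicking C float math.
    return struct.unpack("<f", struct.pack("<f", v))[0]


def f32_bits(v: float) -> int:
    # Bit pattern of v as a float32.
    return struct.unpack("<I", struct.pack("<f", v))[0]


def bits_f32(u: int) -> float:
    # float32 value of a 32-bit pattern.
    return struct.unpack("<f", struct.pack("<I", u))[0]


def float_to_half_bits(x: float) -> int:
    u = f32_bits(x)
    sign16 = (u & 0x80000000) >> 16
    if math.isnan(x):
        return sign16 | 0x7D00  # __MLX_HALF_NAN__
    # Clamp to half's smallest normal exponent, then re-bias by 15.
    expo = max(u & 0x7F800000, 0x38800000) + (15 << 23)
    inf_scale = bits_f32(0x77800000)  # 2**112
    zero_scale = bits_f32(0x08800000)  # 2**-110
    # The float32 addition performs the round-to-nearest step.
    magic = as_f32(bits_f32(expo) + as_f32(as_f32(abs(x) * inf_scale) * zero_scale))
    mu = f32_bits(magic)
    expo16 = (mu >> 13) & 0x7C00  # lower 5 exponent bits
    mant16 = mu & 0x0FFF  # 12 bits carrying the mantissa (plus any carry)
    return (sign16 | ((expo16 + mant16) & 0xFFFF)) & 0xFFFF


assert float_to_half_bits(1.5) == 0x3E00  # fp16 bit pattern of 1.5
```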
3
pyproject.toml
Normal file
@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=42", "pybind11>=2.10", "cmake>=3.24"]
build-backend = "setuptools.build_meta"
124
python/mlx/nn/layers/convolution.py
Normal file
@ -0,0 +1,124 @@
import math
from typing import Union

import mlx.core as mx
from mlx.nn.layers.base import Module


class Conv1d(Module):
    """Applies a 1-dimensional convolution over the multi-channel input sequence.

    The channels are expected to be last i.e. the input shape should be ``NLC`` where:
        - ``N`` is the batch dimension
        - ``L`` is the sequence length
        - ``C`` is the number of input channels

    Args:
        in_channels (int): The number of input channels
        out_channels (int): The number of output channels
        kernel_size (int): The size of the convolution filters
        stride (int, optional): The stride when applying the filter.
            Default: 1.
        padding (int, optional): How many positions to 0-pad the input with.
            Default: 0.
        bias (bool, optional): If ``True`` add a learnable bias to the output.
            Default: ``True``
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = True,
    ):
        super().__init__()

        scale = math.sqrt(1 / (in_channels * kernel_size))
        self.weight = mx.random.uniform(
            low=-scale,
            high=scale,
            shape=(out_channels, kernel_size, in_channels),
        )
        if bias:
            self.bias = mx.zeros((out_channels,))

        self.padding = padding
        self.stride = stride

    def _extra_repr(self):
        return (
            f"{self.weight.shape[-1]}, {self.weight.shape[0]}, "
            f"kernel_size={self.weight.shape[1]}, stride={self.stride}, "
            f"padding={self.padding}, bias={'bias' in self}"
        )

    def __call__(self, x):
        y = mx.conv1d(x, self.weight, self.stride, self.padding)
        if "bias" in self:
            y = y + self.bias
        return y


class Conv2d(Module):
    """Applies a 2-dimensional convolution over the multi-channel input image.

    The channels are expected to be last i.e. the input shape should be ``NHWC`` where:
        - ``N`` is the batch dimension
        - ``H`` is the input image height
        - ``W`` is the input image width
        - ``C`` is the number of input channels

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        kernel_size (int or tuple): The size of the convolution filters.
        stride (int or tuple, optional): The size of the stride when
            applying the filter. Default: 1.
        padding (int or tuple, optional): How many positions to 0-pad
            the input with. Default: 0.
        bias (bool, optional): If ``True`` add a learnable bias to the
            output. Default: ``True``
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, tuple],
        stride: Union[int, tuple] = 1,
        padding: Union[int, tuple] = 0,
        bias: bool = True,
    ):
        super().__init__()

        kernel_size, stride, padding = map(
            lambda x: (x, x) if isinstance(x, int) else x,
            (kernel_size, stride, padding),
        )
        scale = math.sqrt(1 / (in_channels * kernel_size[0] * kernel_size[1]))
        self.weight = mx.random.uniform(
            low=-scale,
            high=scale,
            shape=(out_channels, *kernel_size, in_channels),
        )
        if bias:
            self.bias = mx.zeros((out_channels,))

        self.padding = padding
        self.stride = stride

    def _extra_repr(self):
        return (
            f"{self.weight.shape[-1]}, {self.weight.shape[0]}, "
            f"kernel_size={self.weight.shape[1:3]}, stride={self.stride}, "
            f"padding={self.padding}, bias={'bias' in self}"
        )

    def __call__(self, x):
        y = mx.conv2d(x, self.weight, self.stride, self.padding)
        if "bias" in self:
            y = y + self.bias
        return y
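A quick usage sketch for these channels-last layers, assuming the package re-exports them as `mlx.nn` (as the `python/mlx/nn/layers` layout suggests):

```python
import mlx.core as mx
import mlx.nn as nn

conv = nn.Conv2d(in_channels=3, out_channels=8, kernel_size=3, padding=1)
x = mx.zeros((4, 32, 32, 3))  # NHWC, unlike PyTorch's NCHW
y = conv(x)
print(y.shape)  # (4, 32, 32, 8) with stride 1 and padding 1
```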
28
python/mlx/nn/layers/embedding.py
Normal file
@ -0,0 +1,28 @@
import math

import mlx.core as mx
from mlx.nn.layers.base import Module


class Embedding(Module):
    """Implements a simple lookup table that maps each input integer to a
    high-dimensional vector.

    Typically used to embed discrete tokens for processing by neural networks.

    Args:
        num_embeddings (int): How many possible discrete tokens can we embed.
            Usually called the vocabulary size.
        dims (int): The dimensionality of the embeddings.
    """

    def __init__(self, num_embeddings: int, dims: int):
        super().__init__()
        scale = math.sqrt(1 / dims)
        self.weight = mx.random.normal((num_embeddings, dims)) * scale

    def _extra_repr(self):
        return f"{self.weight.shape[0]}, {self.weight.shape[1]}"

    def __call__(self, x):
        return self.weight[x]
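Embedding lookup is just integer indexing into the weight table; a sketch under the same `mlx.nn` re-export assumption as above:

```python
import mlx.core as mx
import mlx.nn as nn

emb = nn.Embedding(num_embeddings=100, dims=16)
tokens = mx.array([[1, 5, 9], [2, 2, 7]])  # (batch, sequence)
vectors = emb(tokens)
print(vectors.shape)  # (2, 3, 16)
```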
34
python/mlx/nn/layers/linear.py
Normal file
@ -0,0 +1,34 @@
import math

import mlx.core as mx
from mlx.nn.layers.base import Module


class Linear(Module):
    """Applies an affine transformation to the input.

    Args:
        input_dims (int): The dimensionality of the input features
        output_dims (int): The dimensionality of the output features
        bias (bool): If set to ``False`` then the layer will not use a bias
    """

    def __init__(self, input_dims: int, output_dims: int, bias: bool = True):
        super().__init__()
        scale = math.sqrt(1 / input_dims)
        self.weight = mx.random.uniform(
            low=-scale,
            high=scale,
            shape=(output_dims, input_dims),
        )
        if bias:
            self.bias = mx.zeros((output_dims,))

    def _extra_repr(self):
        return f"input_dims={self.weight.shape[1]}, output_dims={self.weight.shape[0]}, bias={'bias' in self}"

    def __call__(self, x):
        x = x @ self.weight.T
        if "bias" in self:
            x = x + self.bias
        return x
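The layer computes `x @ weight.T + bias`; a sketch under the same `mlx.nn` re-export assumption:

```python
import mlx.core as mx
import mlx.nn as nn

layer = nn.Linear(input_dims=32, output_dims=8)
x = mx.zeros((16, 32))
y = layer(x)
print(y.shape)  # (16, 8)
```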
19
python/src/load.h
Normal file
@ -0,0 +1,19 @@
#pragma once

#include <pybind11/pybind11.h>
#include <unordered_map>
#include <variant>
#include "mlx/ops.h"

namespace py = pybind11;
using namespace mlx::core;

using DictOrArray = std::variant<array, std::unordered_map<std::string, array>>;

DictOrArray mlx_load_helper(py::object file, StreamOrDevice s);
void mlx_save_helper(py::object file, array a, bool retain_graph = true);
void mlx_savez_helper(
    py::object file,
    py::args args,
    const py::kwargs& kwargs,
    bool compressed = false);
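This diff does not show where these helpers are bound, so the Python names below are an assumption based on the helper names (`mlx_save_helper`, `mlx_savez_helper`, `mlx_load_helper`); treat this as a sketch rather than a confirmed API:

```python
import mlx.core as mx

a = mx.ones((2, 2))
mx.save("a.npy", a)             # assumed binding of mlx_save_helper
mx.savez("arrays.npz", a=a)     # assumed binding of mlx_savez_helper
loaded = mx.load("arrays.npz")  # returns a dict of arrays, per DictOrArray
print(loaded["a"])
```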
31
python/src/mlx.cpp
Normal file
@ -0,0 +1,31 @@
#include <pybind11/pybind11.h>

#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)

namespace py = pybind11;

void init_array(py::module_&);
void init_device(py::module_&);
void init_stream(py::module_&);
void init_metal(py::module_&);
void init_ops(py::module_&);
void init_transforms(py::module_&);
void init_random(py::module_&);
void init_fft(py::module_&);

PYBIND11_MODULE(core, m) {
  m.doc() = "mlx: A framework for machine learning on Apple Silicon.";

  auto reprlib_fix = py::module_::import("mlx._reprlib_fix");

  init_device(m);
  init_stream(m);
  init_array(m);
  init_metal(m);
  init_ops(m);
  init_transforms(m);
  init_random(m);
  init_fft(m);
  m.attr("__version__") = TOSTRING(_VERSION_);
}
723
python/src/transforms.cpp
Normal file
@ -0,0 +1,723 @@
#include <pybind11/functional.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <algorithm>
#include <fstream>
#include <numeric>
#include <sstream>

#include "mlx/array.h"
#include "mlx/graph_utils.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"

namespace py = pybind11;
using namespace py::literals;
using namespace mlx::core;

using IntOrVec = std::variant<int, std::vector<int>>;
using StrOrVec = std::variant<std::string, std::vector<std::string>>;

template <typename T>
std::vector<T> to_vector(const std::variant<T, std::vector<T>>& v) {
  std::vector<T> vals;
  if (auto pv = std::get_if<T>(&v); pv) {
    vals.push_back(*pv);
  } else {
    vals = std::get<std::vector<T>>(v);
  }
  return vals;
}

void tree_visit(py::object tree, std::function<void(py::handle)> visitor) {
  std::function<void(py::handle)> recurse;
  recurse = [&](py::handle subtree) {
    if (py::isinstance<py::list>(subtree) ||
        py::isinstance<py::tuple>(subtree)) {
      for (auto item : subtree) {
        recurse(item);
      }
    } else if (py::isinstance<py::dict>(subtree)) {
      for (auto item : py::cast<py::dict>(subtree)) {
        recurse(item.second);
      }
    } else {
      visitor(subtree);
    }
  };

  recurse(tree);
}

template <typename T, typename U, typename V>
void validate_subtrees(const std::vector<py::object>& subtrees) {
  int len = py::cast<T>(subtrees[0]).size();
  for (auto& subtree : subtrees) {
    if ((py::isinstance<T>(subtree) && py::cast<T>(subtree).size() != len) ||
        py::isinstance<U>(subtree) || py::isinstance<V>(subtree)) {
      throw std::invalid_argument(
          "[tree_map] Additional input tree is not a valid prefix of the first tree.");
    }
  }
}

py::object tree_map(
    const std::vector<py::object>& trees,
    std::function<py::object(const std::vector<py::object>&)> transform) {
  std::function<py::object(const std::vector<py::object>&)> recurse;

  recurse = [&](const std::vector<py::object>& subtrees) {
    if (py::isinstance<py::list>(subtrees[0])) {
      py::list l;
      std::vector<py::object> items(subtrees.size());
      validate_subtrees<py::list, py::tuple, py::dict>(subtrees);
      for (int i = 0; i < py::cast<py::list>(subtrees[0]).size(); ++i) {
        for (int j = 0; j < subtrees.size(); ++j) {
          if (py::isinstance<py::list>(subtrees[j])) {
            items[j] = py::cast<py::list>(subtrees[j])[i];
          } else {
            items[j] = subtrees[j];
          }
        }
        l.append(recurse(items));
      }
      return py::cast<py::object>(l);
    } else if (py::isinstance<py::tuple>(subtrees[0])) {
      // Check the rest of the subtrees
      std::vector<py::object> items(subtrees.size());
      int len = py::cast<py::tuple>(subtrees[0]).size();
      py::tuple l(len);
      validate_subtrees<py::tuple, py::list, py::dict>(subtrees);
      for (int i = 0; i < len; ++i) {
        for (int j = 0; j < subtrees.size(); ++j) {
          if (py::isinstance<py::tuple>(subtrees[j])) {
            items[j] = py::cast<py::tuple>(subtrees[j])[i];
          } else {
            items[j] = subtrees[j];
          }
        }
        l[i] = recurse(items);
      }
      return py::cast<py::object>(l);
    } else if (py::isinstance<py::dict>(subtrees[0])) {
      std::vector<py::object> items(subtrees.size());
      validate_subtrees<py::dict, py::list, py::tuple>(subtrees);
      py::dict d;
      for (auto item : py::cast<py::dict>(subtrees[0])) {
        for (int j = 0; j < subtrees.size(); ++j) {
          if (py::isinstance<py::dict>(subtrees[j])) {
            auto subdict = py::cast<py::dict>(subtrees[j]);
            if (!subdict.contains(item.first)) {
              throw std::invalid_argument(
                  "[tree_map] Tree is not a valid prefix tree of the first tree.");
            }
            items[j] = subdict[item.first];
          } else {
            items[j] = subtrees[j];
          }
        }
        d[item.first] = recurse(items);
      }
      return py::cast<py::object>(d);
    } else {
      return transform(subtrees);
    }
  };
  return recurse(trees);
}

py::object tree_map(
    py::object tree,
    std::function<py::object(py::handle)> transform) {
  return tree_map({tree}, [&](std::vector<py::object> inputs) {
    return transform(inputs[0]);
  });
}

std::vector<array> tree_flatten(py::object tree, bool strict = true) {
  std::vector<array> flat_tree;

  tree_visit(tree, [&](py::handle obj) {
    if (py::isinstance<array>(obj)) {
      flat_tree.push_back(py::cast<array>(obj));
    } else if (strict) {
      throw std::invalid_argument("Argument is not an array");
    }
  });

  return flat_tree;
}

py::object tree_unflatten(
    py::object tree,
    const std::vector<array>& values,
    int index = 0) {
  return tree_map(tree, [&](py::handle obj) {
    if (py::isinstance<array>(obj)) {
      return py::cast(values[index++]);
    } else {
      return py::cast<py::object>(obj);
    }
  });
}

auto validate_argnums_argnames(
    const std::optional<IntOrVec>& argnums,
    const StrOrVec& argnames) {
  auto vec_names = to_vector(argnames);

  if (!argnums.has_value()) {
    // argnums was not provided and argnames was empty
    if (vec_names.empty()) {
      return std::make_pair(std::vector<int>{0}, vec_names);
    } else {
      return std::make_pair(std::vector<int>{}, vec_names);
    }
  }

  return std::make_pair(to_vector(*argnums), vec_names);
}

auto py_value_and_grad(
    const py::function& fun,
    std::vector<int> argnums,
    std::vector<std::string> argnames,
    const std::string& error_msg_tag,
    bool scalar_func_only) {
  // Sanitize argnums
  if (argnums.size() == 0 && argnames.size() == 0) {
    throw std::invalid_argument(
        error_msg_tag + " Gradient wrt no argument requested");
  }
  if (argnums.size() > 0) {
    std::sort(argnums.begin(), argnums.end());
    if (argnums[0] < 0) {
      std::ostringstream msg;
      msg << error_msg_tag
          << " Can't compute the gradient of negative argument index "
          << argnums[0];
      throw std::invalid_argument(msg.str());
    }
  }

  return [fun, argnums, argnames, error_msg_tag, scalar_func_only](
             const py::args& args, const py::kwargs& kwargs) {
    // Sanitize the input
    if (argnums.size() > 0 && argnums.back() >= args.size()) {
      std::ostringstream msg;
      msg << error_msg_tag << " Can't compute the gradient of argument index "
          << argnums.back() << " because the function is called with only "
          << args.size() << " arguments.";
      throw std::invalid_argument(msg.str());
    }

    for (auto& key : argnames) {
      if (!kwargs.contains(key)) {
        std::ostringstream msg;
        msg << error_msg_tag
            << " Can't compute the gradient of keyword argument '" << key
            << "' because the function is called with the "
            << "following keyword arguments {";
        for (auto item : kwargs) {
          msg << item.first.cast<std::string>() << ",";
        }
        msg << "}";
        throw std::invalid_argument(msg.str());
      }
    }

    // Collect the arrays
    std::vector<array> arrays;
    std::vector<int> counts(1, 0);
    for (auto i : argnums) {
      auto argsi = tree_flatten(args[i]);
      arrays.insert(arrays.end(), argsi.begin(), argsi.end());
      counts.push_back(argsi.size());
    }
    for (auto& key : argnames) {
      auto argsk = tree_flatten(kwargs[key.c_str()]);
      arrays.insert(arrays.end(), argsk.begin(), argsk.end());
      counts.push_back(argsk.size());
    }
    std::partial_sum(counts.cbegin(), counts.cend(), counts.begin());
    std::vector<int> gradient_indices(arrays.size());
    std::iota(gradient_indices.begin(), gradient_indices.end(), 0);

    // py_value_out will hold the output of the python function in order to be
    // able to reconstruct the python tree of extra return values
    py::object py_value_out;
    auto value_and_grads = value_and_grad(
        [&fun,
         &args,
         &kwargs,
         &argnums,
         &argnames,
         &counts,
         &py_value_out,
         &error_msg_tag,
         scalar_func_only](const std::vector<array>& a) {
          // Copy the arguments
          py::args args_cpy = py::tuple(args.size());
          py::kwargs kwargs_cpy = py::kwargs();
          int j = 0;
          for (int i = 0; i < args.size(); ++i) {
            if (j < argnums.size() && i == argnums[j]) {
              args_cpy[i] = tree_unflatten(args[i], a, counts[j]);
              j++;
            } else {
              args_cpy[i] = args[i];
            }
          }
          for (auto& key : argnames) {
            kwargs_cpy[key.c_str()] =
                tree_unflatten(kwargs[key.c_str()], a, counts[j]);
            j++;
          }
          for (auto item : kwargs) {
            if (kwargs_cpy.contains(item.first)) {
              continue;
            }
            kwargs_cpy[item.first] = item.second;
          }

          // Call the python function
          py_value_out = fun(*args_cpy, **kwargs_cpy);

          // Validate the return value of the python function
          if (!py::isinstance<array>(py_value_out)) {
            if (scalar_func_only) {
              std::ostringstream msg;
              msg << error_msg_tag << " The return value of the function "
                  << "whose gradient we want to compute should be a "
                  << "scalar array; but " << py_value_out.get_type()
                  << " was returned.";
              throw std::invalid_argument(msg.str());
            }
            if (!py::isinstance<py::tuple>(py_value_out)) {
              std::ostringstream msg;
              msg << error_msg_tag << " The return value of the function "
                  << "whose gradient we want to compute should be either a "
                  << "scalar array or a tuple with the first value being a "
                  << "scalar array (Union[array, Tuple[array, Any, ...]]); but "
                  << py_value_out.get_type() << " was returned.";
              throw std::invalid_argument(msg.str());
            }
            py::tuple ret = py::cast<py::tuple>(py_value_out);
            if (ret.size() == 0) {
              std::ostringstream msg;
              msg << error_msg_tag << " The return value of the function "
                  << "whose gradient we want to compute should be either a "
                  << "scalar array or a non-empty tuple. The first value should be a "
                  << "scalar array and the rest can be anything. Instead, "
                  << "we got an empty tuple.";
              throw std::invalid_argument(msg.str());
            }
            if (!py::isinstance<array>(ret[0])) {
              std::ostringstream msg;
              msg << error_msg_tag << " The return value of the function "
                  << "whose gradient we want to compute should be either a "
                  << "scalar array or a tuple with the first value being a "
                  << "scalar array (Union[array, Tuple[array, Any, ...]]); but it "
                  << "was a tuple with the first value being of type "
                  << ret[0].get_type() << " .";
              throw std::invalid_argument(msg.str());
            }
          }

          return tree_flatten(py_value_out, false);
        },
        gradient_indices)(arrays);

    auto value = value_and_grads.first;
    auto gradients = value_and_grads.second;

    // Put the gradients back in their container.
    // We have the following cases:
    //
    // 1. Single python positional argument has a gradient (eg argnums=[0])
    // 2. Many python positional arguments have gradients (eg argnums=[0, 1])
    // 3. A python keyword argument has gradients
    //
    // In case 1 we return the original python variable but with the gradients.
    // In case 2 we return a tuple of the above.
    // In case 3 we return a tuple containing a tuple and dict (something like
    // (tuple(), dict(x=mx.array(5))) ).
    py::object positional_grads;
    py::object keyword_grads;
    py::object py_grads;

    // Collect the gradients for the positional arguments
    if (argnums.size() == 1) {
      positional_grads = tree_unflatten(args[argnums[0]], gradients, counts[0]);
    } else if (argnums.size() > 1) {
      py::tuple grads_(argnums.size());
      for (int i = 0; i < argnums.size(); i++) {
        grads_[i] = tree_unflatten(args[argnums[i]], gradients, counts[i]);
      }
      positional_grads = py::cast<py::object>(grads_);
    } else {
      positional_grads = py::none();
    }

    // No keyword argument gradients so return the tuple of gradients
    if (argnames.size() == 0) {
      py_grads = positional_grads;
    } else {
      py::dict grads_;
      for (int i = 0; i < argnames.size(); i++) {
        auto& k = argnames[i];
        grads_[k.c_str()] = tree_unflatten(
            kwargs[k.c_str()], gradients, counts[i + argnums.size()]);
      }
      keyword_grads = py::cast<py::object>(grads_);

      py_grads =
          py::cast<py::object>(py::make_tuple(positional_grads, keyword_grads));
    }

    // Put the values back in the container
    py::object return_value = tree_unflatten(py_value_out, value);
    return std::make_pair(return_value, py_grads);
  };
}

auto py_vmap(
    const py::function& fun,
    const py::object& in_axes,
    const py::object& out_axes) {
  return [fun, in_axes, out_axes](const py::args& args) {
    auto axes_to_flat_tree = [](const py::object& tree,
                                const py::object& axes) {
      auto tree_axes = tree_map(
          {tree, axes},
          [](const std::vector<py::object>& inputs) { return inputs[1]; });
      std::vector<int> flat_axes;
      tree_visit(tree_axes, [&flat_axes](py::handle obj) {
        if (obj.is_none()) {
          flat_axes.push_back(-1);
        } else if (py::isinstance<py::int_>(obj)) {
          flat_axes.push_back(py::cast<int>(py::cast<py::int_>(obj)));
        } else {
          throw std::invalid_argument("[vmap] axis must be int or None.");
        }
      });
      return flat_axes;
    };

    // Inputs must be array or tree of arrays
    auto inputs = tree_flatten(args, true);
    auto flat_in_axes = axes_to_flat_tree(args, in_axes);

    // py_outputs will hold the output of the python function in order to be
    // able to reconstruct the python tree of extra return values
    py::object py_outputs;

    auto vmap_fn =
        [&fun, &args, &inputs, &py_outputs](const std::vector<array>& a) {
          // Call the python function
          py_outputs = fun(*tree_unflatten(args, a));

          // Flatten the outputs
          return tree_flatten(py_outputs, true);
        };

    auto [trace_inputs, trace_outputs] =
        detail::vmap_trace(vmap_fn, inputs, flat_in_axes);

    auto flat_out_axes = axes_to_flat_tree(py_outputs, out_axes);

    // Perform the vmap
    auto outputs = detail::vmap_replace(
        inputs, trace_inputs, trace_outputs, flat_in_axes, flat_out_axes);

    // Put the outputs back in the container
    return tree_unflatten(py_outputs, outputs);
  };
}

void init_transforms(py::module_& m) {
  m.def(
      "eval",
      [](const py::args& args, bool retain_graph) {
        std::vector<array> arrays = tree_flatten(args);
        eval(arrays, retain_graph);
      },
      "retain_graph"_a = false,
      R"pbdoc(
        Evaluate an :class:`array` or tree of :class:`array`.

        Args:
            *args (arrays or trees of arrays): Each argument can be a single array
              or a tree of arrays. If a tree is given the nodes can be a Python
              :class:`list`, :class:`tuple` or :class:`dict` but the leaves must all be
              an :class:`array`.
            retain_graph (bool): Indicate that the graph structure should be
              preserved. This option is intended to enable function transforms
              which contain control flow based on the value of an array.
      )pbdoc");
  m.def(
      "jvp",
      [](const py::function& fun,
         const std::vector<array>& primals,
         const std::vector<array>& tangents) {
        auto vfun = [&fun](const std::vector<array>& primals) {
          py::args args = py::tuple(primals.size());
          for (int i = 0; i < primals.size(); ++i) {
            args[i] = primals[i];
          }
          auto out = fun(*args);
          if (py::isinstance<array>(out)) {
            return std::vector<array>{py::cast<array>(out)};
          } else {
            return py::cast<std::vector<array>>(out);
          }
        };
        return jvp(vfun, primals, tangents);
      },
      "fun"_a,
      "primals"_a,
      "tangents"_a,
      R"pbdoc(
        Compute the Jacobian-vector product.

        This computes the product of the Jacobian of a function ``fun`` evaluated
        at ``primals`` with the ``tangents``.

        Args:
            fun (function): A function which takes a variable number of :class:`array`
              and returns a single :class:`array` or list of :class:`array`.
            primals (list(array)): A list of :class:`array` at which to
              evaluate the Jacobian.
            tangents (list(array)): A list of :class:`array` which are the
              "vector" in the Jacobian-vector product. The ``tangents`` should be the
              same in number, shape, and type as the inputs of ``fun`` (i.e. the ``primals``).

        Returns:
            list(array): A list of the Jacobian-vector products which
            is the same in number, shape, and type as the inputs to ``fun``.
      )pbdoc");
  m.def(
      "vjp",
      [](const py::function& fun,
         const std::vector<array>& primals,
         const std::vector<array>& cotangents) {
        auto vfun = [&fun](const std::vector<array>& primals) {
          py::args args = py::tuple(primals.size());
          for (int i = 0; i < primals.size(); ++i) {
            args[i] = primals[i];
          }
          auto out = fun(*args);
          if (py::isinstance<array>(out)) {
            return std::vector<array>{py::cast<array>(out)};
          } else {
            return py::cast<std::vector<array>>(out);
          }
        };
        return vjp(vfun, primals, cotangents);
      },
      "fun"_a,
      "primals"_a,
      "cotangents"_a,
      R"pbdoc(
        Compute the vector-Jacobian product.

        Computes the product of the ``cotangents`` with the Jacobian of a
        function ``fun`` evaluated at ``primals``.

        Args:
            fun (function): A function which takes a variable number of :class:`array`
              and returns a single :class:`array` or list of :class:`array`.
            primals (list(array)): A list of :class:`array` at which to
              evaluate the Jacobian.
            cotangents (list(array)): A list of :class:`array` which are the
              "vector" in the vector-Jacobian product. The ``cotangents`` should be the
              same in number, shape, and type as the outputs of ``fun``.

        Returns:
            list(array): A list of the vector-Jacobian products which
            is the same in number, shape, and type as the outputs of ``fun``.
      )pbdoc");
  m.def(
      "value_and_grad",
      [](const py::function& fun,
         const std::optional<IntOrVec>& argnums,
         const StrOrVec& argnames) {
        auto [argnums_vec, argnames_vec] =
            validate_argnums_argnames(argnums, argnames);
        return py::cpp_function(py_value_and_grad(
            fun, argnums_vec, argnames_vec, "[value_and_grad]", false));
      },
      "fun"_a,
      "argnums"_a = std::nullopt,
      "argnames"_a = std::vector<std::string>{},
      R"pbdoc(
        Returns a function which computes the value and gradient of ``fun``.

        The function passed to :func:`value_and_grad` should return either
        a scalar loss or a tuple in which the first element is a scalar
        loss and the remaining elements can be anything.

        .. code-block:: python

            import mlx.core as mx

            def mse(params, inputs, targets):
                outputs = forward(params, inputs)
                lvalue = (outputs - targets).square().mean()
                return lvalue

            # Returns lvalue, dlvalue/dparams
            lvalue, grads = mx.value_and_grad(mse)(params, inputs, targets)

            def lasso(params, inputs, targets, a=1.0, b=1.0):
                outputs = forward(params, inputs)
                mse = (outputs - targets).square().mean()
                l1 = mx.abs(outputs - targets).mean()

                loss = a*mse + b*l1

                return loss, mse, l1

            (loss, mse, l1), grads = mx.value_and_grad(lasso)(params, inputs, targets)

        Args:
            fun (function): A function which takes a variable number of
              :class:`array` or trees of :class:`array` and returns
              a scalar output :class:`array` or a tuple the first element
              of which should be a scalar :class:`array`.
            argnums (int or list(int), optional): Specify the index (or indices)
              of the positional arguments of ``fun`` to compute the gradient
              with respect to. If neither ``argnums`` nor ``argnames`` are
              provided ``argnums`` defaults to ``0`` indicating ``fun``'s first
              argument.
            argnames (str or list(str), optional): Specify keyword arguments of
              ``fun`` to compute gradients with respect to. It defaults to ``[]``
              so by default no gradients are computed for keyword arguments.

        Returns:
            function: A function which returns a tuple where the first element
            is the output of ``fun`` and the second element is the gradient(s) of
            the loss with respect to the selected arguments.
      )pbdoc");
  m.def(
      "grad",
      [](const py::function& fun,
         const std::optional<IntOrVec>& argnums,
         const StrOrVec& argnames) {
        auto [argnums_vec, argnames_vec] =
            validate_argnums_argnames(argnums, argnames);
        auto fn =
            py_value_and_grad(fun, argnums_vec, argnames_vec, "[grad]", true);
        return py::cpp_function(
            [fn](const py::args& args, const py::kwargs& kwargs) {
              return fn(args, kwargs).second;
            });
      },
      "fun"_a,
      "argnums"_a = std::nullopt,
      "argnames"_a = std::vector<std::string>{},
      R"pbdoc(
        Returns a function which computes the gradient of ``fun``.

        Args:
            fun (function): A function which takes a variable number of
              :class:`array` or trees of :class:`array` and returns
              a scalar output :class:`array`.
            argnums (int or list(int), optional): Specify the index (or indices)
              of the positional arguments of ``fun`` to compute the gradient
              with respect to. If neither ``argnums`` nor ``argnames`` are
              provided ``argnums`` defaults to ``0`` indicating ``fun``'s first
              argument.
            argnames (str or list(str), optional): Specify keyword arguments of
              ``fun`` to compute gradients with respect to. It defaults to ``[]``
              so by default no gradients are computed for keyword arguments.

        Returns:
            function: A function which has the same input arguments as ``fun`` and
            returns the gradient(s).
      )pbdoc");
  m.def(
      "vmap",
      [](const py::function& fun,
         const py::object& in_axes,
         const py::object& out_axes) {
        return py::cpp_function(py_vmap(fun, in_axes, out_axes));
      },
      "fun"_a,
      "in_axes"_a = 0,
      "out_axes"_a = 0,
      R"pbdoc(
        Returns a vectorized version of ``fun``.

        Args:
            fun (function): A function which takes a variable number of
              :class:`array` or a tree of :class:`array` and returns
              a variable number of :class:`array` or a tree of :class:`array`.
            in_axes (int, optional): An integer or a valid prefix tree of the
              inputs to ``fun`` where each node specifies the vmapped axis. If
              the value is ``None`` then the corresponding input(s) are not vmapped.
              Defaults to ``0``.
            out_axes (int, optional): An integer or a valid prefix tree of the
              outputs of ``fun`` where each node specifies the vmapped axis. If
              the value is ``None`` then the corresponding output(s) are not vmapped.
              Defaults to ``0``.

        Returns:
            function: The vectorized function.
      )pbdoc");
  m.def(
      "simplify",
      [](const py::args& args) {
        std::vector<array> arrays = tree_flatten(args);
        simplify(arrays);
      },
      R"pbdoc(
        Simplify the graph that computes the arrays.

        Run a few fast graph simplification operations to reuse computation and
        reduce memory consumption. This function is meant to be run every time
        so its overhead should be small, approximately 1ms for a graph with a
        few thousand nodes.

        .. code-block:: python

            import mlx.core as mx

            def foo(x):
                y = x @ x
                z = x @ x
                return y + z

            x = mx.ones((10, 10))
            y = foo(x)
            z = foo(x)

            # Computes the matmul twice
            mx.eval(y)

            # Computes the matmul once
            mx.simplify(z)
            mx.eval(z)

        Args:
            args: Any number of arrays and/or trees of arrays to be simplified.
      )pbdoc");
  m.def(
      "export_to_dot",
      [](py::object file, const py::args& args) {
        std::vector<array> arrays = tree_flatten(args);
        if (py::isinstance<py::str>(file)) {
          std::ofstream out(py::cast<std::string>(file));
          export_to_dot(out, arrays);
        } else if (py::hasattr(file, "write")) {
          std::ostringstream out;
          export_to_dot(out, arrays);
          auto write = file.attr("write");
          write(out.str());
        } else {
          throw std::invalid_argument(
              "export_to_dot accepts file-like objects or strings to be used as filenames");
        }
      },
      "file"_a);
}
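Since this file defines the whole Python-facing transform surface, a compact usage sketch ties the bindings together (using only functions bound above):

```python
import mlx.core as mx


def loss(w, x):
    return mx.sum(w * x)


w = mx.ones((4,))
x = mx.ones((4,))

# Gradient with respect to argument 0 by default.
dloss_dw = mx.grad(loss)(w, x)

# Value and gradient in one pass, as in the docstring above.
value, grads = mx.value_and_grad(loss)(w, x)

# Vectorize a per-example function over the leading axis.
batched = mx.vmap(lambda v: v * 2, in_axes=0, out_axes=0)
print(batched(mx.ones((8, 4))).shape)  # (8, 4)
```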
16
python/tests/mlx_tests.py
Normal file
@ -0,0 +1,16 @@
import os
import unittest

import mlx.core as mx


class MLXTestCase(unittest.TestCase):
    def setUp(self):
        self.default = mx.default_device()
        device = os.getenv("DEVICE", None)
        if device is not None:
            device = getattr(mx, device)
            mx.set_default_device(device)

    def tearDown(self):
        mx.set_default_device(self.default)
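Because `setUp` resolves the device with `getattr(mx, device)`, the `DEVICE` environment variable must name an attribute of `mlx.core` such as `cpu` or `gpu`; for example, `DEVICE=gpu python -m unittest discover python/tests` runs the suite on the GPU.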
445
python/tests/test_blas.py
Normal file
@ -0,0 +1,445 @@
import unittest
from itertools import permutations

import math
import mlx.core as mx
import numpy as np

import mlx_tests


class TestBlas(mlx_tests.MLXTestCase):
    @property
    def dtypes(self):
        return ["float32", "float16"] if mx.metal.is_available() else ["float32"]

    def __gemm_test(
        self,
        shape_a,
        shape_b,
        np_dtype=np.float32,
        f_np_a=lambda x: x,
        f_np_b=lambda x: x,
        f_mx_a=lambda x: x,
        f_mx_b=lambda x: x,
    ):
        with self.subTest(
            dtype=np.dtype(np_dtype).name, shape_a=shape_a, shape_b=shape_b
        ):
            np.random.seed(42)
            scale = max(np.sum(shape_a), 128)
            a_np = np.random.normal(0.0, 1.0 / scale, shape_a).astype(np_dtype)
            b_np = np.random.normal(0.0, 1.0 / scale, shape_b).astype(np_dtype)

            a_mx = mx.array(a_np)
            b_mx = mx.array(b_np)

            a_np = f_np_a(a_np.astype(np.float32))
            b_np = f_np_b(b_np.astype(np.float32))
            a_mx = f_mx_a(a_mx)
            b_mx = f_mx_b(b_mx)

            out_npy = a_np @ b_np
            out_mlx = a_mx @ b_mx

            self.assertListEqual(list(out_npy.shape), list(out_mlx.shape))
            self.assertTrue(np.allclose(out_mlx, out_npy.astype(np_dtype), atol=1e-5))

    def test_matmul_unaligned(self):
        if not mx.metal.is_available():
            return

        for dtype in self.dtypes:
            np_dtype = getattr(np, dtype)
            base_shapes = [4, 8, 16, 32, 64, 128]
            perturbations = [-2, -1, 0, 1, 2]

            for dim in base_shapes:
                for p in perturbations:
                    shape_a = (dim + p, dim + p)
                    shape_b = (dim + p, dim + p)
                    self.__gemm_test(shape_a, shape_b, np_dtype)

    def test_matmul_shapes(self):
        if not mx.metal.is_available():
            return

        shapes = [
            (1, 2, 1, 1),
            (1, 1, 2, 1),
            (3, 23, 457, 3),
        ]

        if mx.default_device() == mx.gpu:
            shapes += [
                (16, 768, 768, 128),
            ]

        for dtype in self.dtypes:
            np_dtype = getattr(np, dtype)

            for B, M, N, K in shapes:

                with self.subTest(transpose="nn"):
                    shape_a = (B, M, K)
                    shape_b = (B, K, N)
                    self.__gemm_test(shape_a, shape_b, np_dtype)

                with self.subTest(transpose="nt"):
                    shape_a = (B, M, K)
                    shape_b = (B, N, K)
                    self.__gemm_test(
                        shape_a,
                        shape_b,
                        np_dtype,
                        f_np_b=lambda x: np.transpose(x, (0, 2, 1)),
                        f_mx_b=lambda x: mx.transpose(x, (0, 2, 1)),
                    )

                with self.subTest(transpose="tn"):
                    shape_a = (B, K, M)
                    shape_b = (B, K, N)
                    self.__gemm_test(
                        shape_a,
                        shape_b,
                        np_dtype,
                        f_np_a=lambda x: np.transpose(x, (0, 2, 1)),
                        f_mx_a=lambda x: mx.transpose(x, (0, 2, 1)),
                    )

                with self.subTest(transpose="tt"):
                    shape_a = (B, K, M)
                    shape_b = (B, N, K)
                    self.__gemm_test(
                        shape_a,
                        shape_b,
                        np_dtype,
                        f_np_a=lambda x: np.transpose(x, (0, 2, 1)),
                        f_mx_a=lambda x: mx.transpose(x, (0, 2, 1)),
                        f_np_b=lambda x: np.transpose(x, (0, 2, 1)),
                        f_mx_b=lambda x: mx.transpose(x, (0, 2, 1)),
                    )

    def test_matmul(self):
        # Note: so far, matmul only works with floating-point types
        a = mx.array([[1.0, 2.0], [3.0, 4.0]])

        b = mx.array([[0.0, -1.0], [-3.0, 3.0]])

        expected = [[-6.0, 5.0], [-12.0, 9.0]]

        self.assertEqual((a @ b).tolist(), expected)
        self.assertEqual(mx.matmul(a, b).tolist(), expected)

        # Transposed matmul
        np.random.seed(0)
        a_npy = np.random.normal(0.0, 1.0 / 128, (128, 16)).astype(np.float32)
        b_npy = np.random.normal(0.0, 1.0 / 128, (128, 16)).astype(np.float32)
        c_npy = a_npy @ np.transpose(b_npy, (1, 0))
        d_npy = np.transpose(a_npy, (1, 0)) @ b_npy

        a_mlx = mx.array(a_npy)
        b_mlx = mx.array(b_npy)
        c_mlx = a_mlx @ mx.transpose(b_mlx, (1, 0))
        d_mlx = mx.transpose(a_mlx, (1, 0)) @ b_mlx

        self.assertListEqual(list(c_npy.shape), list(c_mlx.shape))
        self.assertListEqual(list(d_npy.shape), list(d_mlx.shape))

        self.assertTrue(np.allclose(c_mlx, c_npy, atol=1e-6))
        self.assertTrue(np.allclose(d_mlx, d_npy, atol=1e-6))

    def test_matmul_dtypes(self):

        for dt in self.dtypes:
            a_npy = np.random.normal(0.0, 1.0 / 256, (16, 16, 16)).astype(
                getattr(np, dt)
            )
            b_npy = np.random.normal(0.0, 1.0 / 256, (16, 16, 16)).astype(
                getattr(np, dt)
            )
            a_mlx = mx.array(a_npy)
            b_mlx = mx.array(b_npy)

            c_npy = np.matmul(a_npy, b_npy, dtype=getattr(np, dt))
            c_mlx = a_mlx @ b_mlx

            self.assertTrue(np.allclose(c_mlx, c_npy, atol=1e-6))

    def test_matmul_batched(self):
        np.random.seed(0)
        # Batched matmul
        a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
        b_npy = np.random.normal(0.0, 1.0 / 128, (32, 16, 16)).astype(np.float32)
        c_npy = a_npy @ b_npy

        a_mlx = mx.array(a_npy)
        b_mlx = mx.array(b_npy)
        c_mlx = a_mlx @ b_mlx

        self.assertListEqual(list(c_npy.shape), list(c_mlx.shape))
        self.assertTrue(np.allclose(c_mlx, c_npy, atol=1e-6))

        # Batched and transposed matmul
        b_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
        c_npy = a_npy @ np.transpose(b_npy, (0, 2, 1))

        b_mlx = mx.array(b_npy)
        c_mlx = a_mlx @ mx.transpose(b_mlx, (0, 2, 1))

        self.assertListEqual(list(c_npy.shape), list(c_mlx.shape))
        self.assertTrue(np.allclose(c_mlx, c_npy, atol=1e-6))

        # Batched matmul with simple broadcast
        a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
        b_npy = np.random.normal(0.0, 1.0 / 128, (16, 16)).astype(np.float32)
        c_npy = a_npy @ b_npy

        a_mlx = mx.array(a_npy)
        b_mlx = mx.array(b_npy)
        c_mlx = a_mlx @ b_mlx

        self.assertListEqual(list(c_npy.shape), list(c_mlx.shape))
        self.assertTrue(np.allclose(c_mlx, c_npy, atol=1e-6))

        # Both operands broadcasted
        d_npy = np.broadcast_to(b_npy, (5, 16, 16))
        d_mlx = mx.broadcast_to(b_mlx, (5, 16, 16))

        e_npy = d_npy @ d_npy
        e_mlx = d_mlx @ d_mlx

        self.assertListEqual(list(e_npy.shape), list(e_mlx.shape))
        self.assertTrue(np.allclose(e_mlx, e_npy, atol=1e-6))

        # Batched and transposed matmul with simple broadcast
        a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
        b_npy = np.random.normal(0.0, 1.0 / 128, (128, 16)).astype(np.float32)
        a_mlx = mx.array(a_npy)
        b_mlx = mx.array(b_npy)

        c_npy = a_npy @ np.transpose(b_npy, (1, 0))
        c_mlx = a_mlx @ mx.transpose(b_mlx, (1, 0))

        self.assertListEqual(list(c_npy.shape), list(c_mlx.shape))
        self.assertTrue(np.allclose(c_mlx, c_npy, atol=1e-6))

        # Matmul with vector
        a_npy = np.random.normal(0.0, 1.0 / 128, (32, 128, 16)).astype(np.float32)
        b_npy = np.random.normal(0.0, 1.0 / 128, (16,)).astype(np.float32)
        a_mlx = mx.array(a_npy)
        b_mlx = mx.array(b_npy)

        c_npy = a_npy @ b_npy
        c_mlx = a_mlx @ b_mlx

        self.assertListEqual(list(c_npy.shape), list(c_mlx.shape))
        self.assertTrue(np.allclose(c_mlx, c_npy, atol=1e-6))

        # Test multi-headed attention style matmul
        a_npy = np.random.normal(0.0, 1.0 / 128, (64, 16, 4, 32)).astype(np.float32)
        b_npy = np.random.normal(0.0, 1.0 / 128, (64, 16, 4, 32)).astype(np.float32)
        a_mlx = mx.array(a_npy)
        b_mlx = mx.array(b_npy)

        a_npy = np.transpose(a_npy, (0, 2, 1, 3))
        b_npy = np.transpose(b_npy, (0, 2, 1, 3))
        a_mlx = mx.transpose(a_mlx, (0, 2, 1, 3))
        b_mlx = mx.transpose(b_mlx, (0, 2, 1, 3))

        c_npy = a_npy @ np.transpose(b_npy, (0, 1, 3, 2))
        c_mlx = a_mlx @ mx.transpose(b_mlx, (0, 1, 3, 2))
        self.assertListEqual(list(c_npy.shape), list(c_mlx.shape))
        self.assertTrue(np.allclose(c_mlx, c_npy, atol=1e-6))

    def __gemv_test(
        self,
        shape_mat,
        shape_vec,
        np_dtype=np.float32,
        mat_first=True,
        np_mat_f=lambda x: x,
        np_vec_f=lambda x: x,
        mlx_mat_f=lambda x: x,
        mlx_vec_f=lambda x: x,
    ):
        with self.subTest(shape=shape_mat):
            np.random.seed(42)
            scale = max(np.sum(shape_mat), 32)
            mat_npy = np.random.normal(0.0, 1.0 / scale, shape_mat).astype(np_dtype)
            vec_npy = np.random.normal(0.0, 1.0 / scale, shape_vec).astype(np_dtype)

            mat_mlx = mx.array(mat_npy)
            vec_mlx = mx.array(vec_npy)

            mat_npy = np_mat_f(mat_npy)
            vec_npy = np_vec_f(vec_npy)
            mat_mlx = mlx_mat_f(mat_mlx)
            vec_mlx = mlx_vec_f(vec_mlx)

            if mat_first:
                out_npy = mat_npy @ vec_npy
                out_mlx = mat_mlx @ vec_mlx
            else:
                out_npy = vec_npy @ mat_npy
                out_mlx = vec_mlx @ mat_mlx

            self.assertListEqual(list(out_npy.shape), list(out_mlx.shape))
            self.assertTrue(np.allclose(out_mlx, out_npy, atol=1e-5))

    def test_matrix_vector(self):
        for dtype in self.dtypes:
            with self.subTest(dtype=dtype):
                np_dtype = getattr(np, dtype)

                # Basic square matrix test
                self.__gemv_test(
                    shape_mat=(64, 64), shape_vec=(64, 1), np_dtype=np_dtype
                )
                self.__gemv_test(
                    shape_mat=(64, 64),
                    shape_vec=(64, 1),
                    np_dtype=np_dtype,
                    mat_first=False,
                    np_vec_f=lambda x: np.transpose(x, (1, 0)),
                    mlx_vec_f=lambda x: mx.transpose(x, (1, 0)),
                )

                # Vector matrix product with aligned and unaligned shapes
                for in_len_base, out_len_base in (
                    (2, 2),
                    (32, 32),
                    (64, 64),
                    (2048, 2048),
                ):
                    for mi in (-1, 0, 1):
                        for mj in (-1, 0, 1):
                            # Vec mat
                            shape_mat = (in_len_base + mi, out_len_base + mj)
                            shape_vec = (1, in_len_base + mi)
                            self.__gemv_test(
                                shape_mat, shape_vec, mat_first=False, np_dtype=np_dtype
                            )

                            # Mat vec
                            shape_mat = (out_len_base + mj, in_len_base + mi)
                            shape_vec = (in_len_base + mi, 1)
                            self.__gemv_test(
                                shape_mat, shape_vec, mat_first=True, np_dtype=np_dtype
                            )

    def test_matrix_vector_batched(self):
        for dtype in self.dtypes:
            with self.subTest(dtype=dtype):
                np_dtype = getattr(np, dtype)

                # Batched mat vec
                for shape_mat, shape_vec in (
                    ((32, 128, 64), (32, 64, 1)),
                    ((128, 64), (32, 64, 1)),
                    ((32, 128, 64), (64, 1)),
                ):
                    self.__gemv_test(
                        shape_mat, shape_vec, mat_first=True, np_dtype=np_dtype
                    )

                # Batched vec mat
                for shape_vec, shape_mat in (
                    ((32, 1, 128), (32, 128, 64)),
                    ((32, 1, 128), (128, 64)),
                    ((1, 128), (32, 128, 64)),
                ):
                    self.__gemv_test(
                        shape_mat, shape_vec, mat_first=False, np_dtype=np_dtype
                    )

    def test_matrix_vector_broadcast(self):
        for dtype in self.dtypes:
            with self.subTest(dtype=dtype):
                np_dtype = getattr(np, dtype)

                # Different broadcasts mat vec
                for shape_mat, shape_vec in (
                    ((32, 64, 64), (32, 64, 1)),
                    ((64, 64), (32, 64, 1)),
                    ((32, 64, 64), (64, 1)),
                ):
                    self.__gemv_test(
                        shape_mat=(64, 64),
                        shape_vec=(64, 1),
                        np_dtype=np_dtype,
                        np_mat_f=(lambda mat_npy: np.broadcast_to(mat_npy, shape_mat)),
                        np_vec_f=(lambda vec_npy: np.broadcast_to(vec_npy, shape_vec)),
                        mlx_mat_f=(lambda mat_mlx: mx.broadcast_to(mat_mlx, shape_mat)),
                        mlx_vec_f=(lambda vec_mlx: mx.broadcast_to(vec_mlx, shape_vec)),
                    )

                # Different broadcasts vec mat
                for shape_vec, shape_mat in (
                    ((32, 1, 64), (32, 64, 64)),
                    ((32, 1, 64), (64, 64)),
                    ((1, 64), (32, 64, 64)),
                ):
                    self.__gemv_test(
                        shape_mat=(64, 64),
                        shape_vec=(1, 64),
                        np_dtype=np_dtype,
                        mat_first=False,
                        np_mat_f=lambda mat_npy: np.broadcast_to(mat_npy, shape_mat),
                        np_vec_f=lambda vec_npy: np.broadcast_to(vec_npy, shape_vec),
                        mlx_mat_f=lambda mat_mlx: mx.broadcast_to(mat_mlx, shape_mat),
                        mlx_vec_f=lambda vec_mlx: mx.broadcast_to(vec_mlx, shape_vec),
                    )

    def test_matrix_vector_edgecases(self):
        for dtype in self.dtypes:
            with self.subTest(dtype=dtype):
                np_dtype = getattr(np, dtype)

                for in_vec_len in np.arange(1, 5):
                    for out_vec_len in np.arange(1, 5):
                        for batch_size in np.arange(1, 5):
                            with self.subTest(
                                problem_shape=(batch_size, in_vec_len, out_vec_len)
                            ):
                                # Matrix vector
                                with self.subTest(transpose=False):
                                    a_npy = np.ones(
                                        (batch_size, out_vec_len, in_vec_len),
                                        dtype=np_dtype,
                                    )
                                    b_npy = np.ones(
                                        (batch_size, in_vec_len, 1), dtype=np_dtype
                                    )
                                    for i in range(batch_size):
                                        b_npy[i] *= i + 1.0

                                    a_mlx, b_mlx = map(mx.array, [a_npy, b_npy])
                                    c_npy = a_npy @ b_npy
                                    c_mlx = a_mlx @ b_mlx

                                    self.assertListEqual(
                                        list(c_npy.shape), list(c_mlx.shape)
                                    )
                                    self.assertTrue(np.array_equal(c_mlx, c_npy))

                                # Vector matrix
                                with self.subTest(transpose=True):
                                    a_npy = np.ones(
                                        (batch_size, out_vec_len, in_vec_len),
                                        dtype=np_dtype,
                                    )
                                    b_npy = np.ones(
                                        (batch_size, 1, out_vec_len), dtype=np_dtype
                                    )
                                    for i in range(batch_size):
                                        b_npy[i] *= i + 1.0

                                    a_mlx, b_mlx = map(mx.array, [a_npy, b_npy])
                                    c_npy = b_npy @ a_npy
                                    c_mlx = b_mlx @ a_mlx

                                    self.assertListEqual(
                                        list(c_npy.shape), list(c_mlx.shape)
                                    )
                                    self.assertTrue(np.array_equal(c_mlx, c_npy))
445
python/tests/test_conv.py
Normal file
@ -0,0 +1,445 @@
import unittest
from itertools import permutations

import math
import mlx.core as mx
import numpy as np

import mlx_tests

try:
    import torch
    import torch.nn.functional as F

    has_torch = True
except ImportError as e:
    has_torch = False


class TestConv(mlx_tests.MLXTestCase):
    def test_numpy_conv(self):
        for dtype in (
            "float16",
            "float32",
        ):
            np_dtype = getattr(np, dtype)
            for M, N, mode in (
                (1, 1, "full"),
                (25, 5, "full"),
                (24, 5, "same"),
                (24, 4, "same"),
                (24, 4, "valid"),
                (4, 24, "full"),
                (5, 25, "same"),
                (4, 25, "valid"),
            ):
                with self.subTest(dtype=dtype, M=M, N=N, mode=mode):
                    atol = 1e-6 if dtype == "float32" else 1e-5
                    a_np = np.random.rand(M).astype(np_dtype)
                    v_np = np.random.rand(N).astype(np_dtype)
                    a_mx = mx.array(a_np)
                    v_mx = mx.array(v_np)

                    c_np = np.convolve(a_np, v_np, mode=mode)
                    c_mx = mx.convolve(a_mx, v_mx, mode=mode)

                    self.assertListEqual(list(c_mx.shape), list(c_np.shape))
                    self.assertTrue(np.allclose(c_mx, c_np, atol=atol))

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_1D(self):
        def run_conv1D(
            N,
            C,
            O,
            iH,
            kH,
            stride,
            padding,
            dilation=1,
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            with self.subTest(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                iH=iH,
                kH=kH,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            ):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                in_np = np.random.normal(0, 1.0 / C, (N, iH, C)).astype(np_dtype)
                wt_np = np.random.normal(0, 1.0 / C, (O, kH, C)).astype(np_dtype)

                in_mx, wt_mx = map(mx.array, (in_np, wt_np))
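                # MLX convolutions take channels-last inputs (N, L, C) while
                # torch.conv1d expects channels-first (N, C, L), hence the
                # transposes around the torch reference below.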
                in_pt, wt_pt = map(
                    lambda x: torch.from_numpy(x.transpose(0, 2, 1)), (in_np, wt_np)
                )

                out_mx = mx.conv1d(
                    in_mx,
                    wt_mx,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                out_pt = torch.conv1d(
                    in_pt,
                    wt_pt,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                out_pt = torch.transpose(out_pt, 2, 1)

                self.assertListEqual(list(out_pt.shape), out_mx.shape)
                self.assertTrue(np.allclose(out_pt.numpy(), out_mx, atol=atol))

        for dtype in ("float32",):
            for N, C, O in (
                (1, 1, 1),
                (1, 6, 1),
                (1, 1, 6),
                (4, 32, 64),
            ):
                for iH, kH, stride, padding in (
                    (1, 1, 1, 0),
                    (3, 3, 1, 0),
                    (31, 5, 5, 2),
                ):
                    run_conv1D(N, C, O, iH, kH, stride, padding, dtype=dtype)

        # Strided inputs tests
        for tpose_in, tpose_wt in (
            ((0, 2, 1), (0, 1, 2)),
            ((0, 2, 1), (0, 2, 1)),
        ):
            with self.subTest(name="strided", tpose_in=tpose_in, tpose_wt=tpose_wt):
                in_np = np.random.normal(0, 1.0 / 16, (16, 16, 16)).astype(np.float32)
                wt_np = np.random.normal(0, 1.0 / 16, (16, 16, 16)).astype(np.float32)

                in_mx, wt_mx = map(mx.array, (in_np, wt_np))
                in_mx_t = mx.transpose(in_mx, tpose_in)
                wt_mx_t = mx.transpose(wt_mx, tpose_wt)
                out_mx = mx.conv1d(in_mx_t, wt_mx_t)

                in_pt, wt_pt = map(
                    lambda x: torch.from_numpy(x.transpose(0, 2, 1)),
                    (in_np.transpose(tpose_in), wt_np.transpose(tpose_wt)),
                )

                out_pt = torch.conv1d(in_pt, wt_pt)
                out_pt = torch.transpose(out_pt, 2, 1)

                self.assertListEqual(list(out_pt.shape), out_mx.shape)
                self.assertTrue(np.allclose(out_pt.numpy(), out_mx, atol=1e-5))

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_1D_grad(self):
        def run_conv1D_grad(
            N,
            C,
            O,
            iH,
            kH,
            stride,
            padding,
            dilation=1,
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            with self.subTest(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                iH=iH,
                kH=kH,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            ):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
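                # Output length of a strided, dilated convolution:
                #   oH = floor((iH + 2 * padding - dilation * (kH - 1) - 1) / stride) + 1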
                oH = 1 + ((iH + 2 * padding - dilation * (kH - 1) - 1) // stride)

                in_np = np.random.normal(0, 1.0 / C, (N, iH, C)).astype(np_dtype)
                wt_np = np.random.normal(0, 1.0 / C, (O, kH, C)).astype(np_dtype)
                ct_np = np.random.normal(0, 1.0 / C, (N, oH, O)).astype(np_dtype)

                in_mx, wt_mx, ct_mx = map(mx.array, (in_np, wt_np, ct_np))
                in_pt, wt_pt, ct_pt = map(
                    lambda x: torch.from_numpy(x.transpose(0, 2, 1)),
                    (in_np, wt_np, ct_np),
                )

                def f(a, b):
                    return mx.conv1d(
                        a,
                        b,
                        stride=stride,
                        padding=padding,
                        dilation=dilation,
                        groups=groups,
                    )

                _, outs_mx = mx.vjp(
                    f,
                    [
                        in_mx,
                        wt_mx,
                    ],
                    [
                        ct_mx,
                    ],
                )
                pt_grad_in = F.grad.conv1d_input(
                    in_pt.shape,
                    wt_pt,
                    ct_pt,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                pt_grad_wt = F.grad.conv1d_weight(
                    in_pt,
                    wt_pt.shape,
                    ct_pt,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                pt_grad_in = torch.transpose(pt_grad_in, 2, 1).numpy()
                pt_grad_wt = torch.transpose(pt_grad_wt, 2, 1).numpy()

                mx_grad_in, mx_grad_wt = outs_mx

                self.assertListEqual(list(pt_grad_in.shape), mx_grad_in.shape)
                self.assertListEqual(list(in_mx.shape), mx_grad_in.shape)
                self.assertTrue(np.allclose(pt_grad_in, mx_grad_in, atol=atol))

                self.assertListEqual(list(pt_grad_wt.shape), mx_grad_wt.shape)
                self.assertListEqual(list(wt_mx.shape), mx_grad_wt.shape)
                self.assertTrue(np.allclose(pt_grad_wt, mx_grad_wt, atol=atol))

        for dtype in ("float32",):
            for N, C, O in (
                (1, 1, 1),
                (1, 6, 1),
                (1, 1, 6),
                (4, 32, 64),
            ):
                for iH, kH, stride, padding in (
                    (1, 1, 1, 0),
                    (3, 3, 1, 0),
                    (31, 5, 5, 2),
                ):
                    run_conv1D_grad(N, C, O, iH, kH, stride, padding, dtype=dtype)

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_2D(self):
        def run_conv2D(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1),
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            with self.subTest(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                idim=idim,
                kdim=kdim,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            ):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iH, iW = idim
                kH, kW = kdim
                scale = 1.0 / math.sqrt(kH * kW * C)
                in_np = np.random.normal(0.0, scale, (N, iH, iW, C)).astype(np_dtype)
                wt_np = np.random.normal(0.0, 1.0, (O, kH, kW, C)).astype(np_dtype)

                in_mx, wt_mx = map(mx.array, (in_np, wt_np))
                in_pt, wt_pt = map(
                    lambda x: torch.from_numpy(x.transpose(0, 3, 1, 2)).to("cpu"),
                    (in_np, wt_np),
                )

                out_mx = mx.conv2d(
                    in_mx,
                    wt_mx,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                out_pt = torch.conv2d(
                    in_pt,
                    wt_pt,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                out_pt = torch.permute(out_pt, (0, 2, 3, 1)).numpy(force=True)

                self.assertListEqual(list(out_pt.shape), list(out_mx.shape))
                self.assertTrue(np.allclose(out_pt, out_mx, atol=atol))

        for dtype in ("float32",):
            for N, C, O in (
                (1, 1, 1),
                (1, 6, 1),
                (1, 1, 6),
                (4, 32, 64),
            ):
                for idim, kdim, stride, padding in (
                    ((1, 1), (1, 1), (1, 1), (0, 0)),
                    ((3, 3), (3, 1), (1, 1), (0, 0)),
                    ((31, 31), (5, 5), (5, 5), (2, 2)),
                ):
                    run_conv2D(N, C, O, idim, kdim, stride, padding, dtype=dtype)

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_2D_grad(self):
        def run_conv2D_grad(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1),
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            with self.subTest(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                idim=idim,
                kdim=kdim,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            ):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iH, iW = idim
                kH, kW = kdim
                scale = 1.0 / math.sqrt(kH * kW * C)

                oH = 1 + (
                    (iH + 2 * padding[0] - dilation[0] * (kH - 1) - 1) // stride[0]
                )
                oW = 1 + (
                    (iW + 2 * padding[1] - dilation[1] * (kW - 1) - 1) // stride[1]
                )

                in_np = np.random.normal(0.0, scale, (N, iH, iW, C)).astype(np_dtype)
                wt_np = np.random.normal(0.0, scale, (O, kH, kW, C)).astype(np_dtype)
                ct_np = np.random.normal(0.0, scale, (N, oH, oW, O)).astype(np_dtype)

                in_mx, wt_mx, ct_mx = map(mx.array, (in_np, wt_np, ct_np))
                in_pt, wt_pt, ct_pt = map(
                    lambda x: torch.from_numpy(x.transpose(0, 3, 1, 2)).to("cpu"),
                    (in_np, wt_np, ct_np),
                )

                def f(a, b):
                    return mx.conv2d(
                        a,
                        b,
                        stride=stride,
                        padding=padding,
                        dilation=dilation,
                        groups=groups,
                    )

                _, outs_mx = mx.vjp(
                    f,
                    [
                        in_mx,
                        wt_mx,
                    ],
                    [
                        ct_mx,
                    ],
                )
                pt_grad_in = F.grad.conv2d_input(
                    in_pt.shape,
                    wt_pt,
                    ct_pt,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                pt_grad_wt = F.grad.conv2d_weight(
                    in_pt,
                    wt_pt.shape,
                    ct_pt,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                pt_grad_in = torch.permute(pt_grad_in, (0, 2, 3, 1)).numpy()
                pt_grad_wt = torch.permute(pt_grad_wt, (0, 2, 3, 1)).numpy()

                mx_grad_in, mx_grad_wt = outs_mx

                self.assertListEqual(list(pt_grad_in.shape), mx_grad_in.shape)
                self.assertListEqual(list(in_mx.shape), mx_grad_in.shape)
                self.assertTrue(np.allclose(pt_grad_in, mx_grad_in, atol=atol))

                self.assertListEqual(list(pt_grad_wt.shape), mx_grad_wt.shape)
                self.assertListEqual(list(wt_mx.shape), mx_grad_wt.shape)
                self.assertTrue(np.allclose(pt_grad_wt, mx_grad_wt, atol=atol))

        for dtype in ("float32",):
            for N, C, O in (
                (1, 1, 1),
                (1, 6, 1),
                (1, 1, 6),
                (4, 32, 64),
            ):
                for idim, kdim, stride, padding in (
                    ((1, 1), (1, 1), (1, 1), (0, 0)),
                    ((3, 3), (3, 1), (1, 1), (0, 0)),
                    ((31, 31), (5, 5), (5, 5), (2, 2)),
                ):
                    run_conv2D_grad(N, C, O, idim, kdim, stride, padding, dtype=dtype)


if __name__ == "__main__":
    unittest.main()
157
python/tests/test_load.py
Normal file
@ -0,0 +1,157 @@
import unittest
import os
import mlx.core as mx
import numpy as np
import tempfile

import mlx_tests


class TestLoad(mlx_tests.MLXTestCase):
    dtypes = [
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "int8",
        "int16",
        "int32",
        "int64",
        "float32",
        "float16",
        "complex64",
    ]

    @classmethod
    def setUpClass(cls):
        cls.test_dir_fid = tempfile.TemporaryDirectory()
        cls.test_dir = cls.test_dir_fid.name

    @classmethod
    def tearDownClass(cls):
        cls.test_dir_fid.cleanup()

    def test_save_and_load(self):

        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        for dt in self.dtypes:
            with self.subTest(dtype=dt):
                for i, shape in enumerate([(1,), (23,), (1024, 1024), (4, 6, 3, 1, 2)]):
                    with self.subTest(shape=shape):
                        save_file_mlx = os.path.join(self.test_dir, f"mlx_{dt}_{i}.npy")
                        save_file_npy = os.path.join(self.test_dir, f"npy_{dt}_{i}.npy")

                        save_arr = np.random.uniform(0.0, 32.0, size=shape)
                        save_arr_npy = save_arr.astype(getattr(np, dt))
                        save_arr_mlx = mx.array(save_arr_npy)

                        mx.save(save_file_mlx, save_arr_mlx)
                        np.save(save_file_npy, save_arr_npy)

                        # Load array saved by mlx as mlx array
                        load_arr_mlx_mlx = mx.load(save_file_mlx)
                        self.assertTrue(mx.array_equal(load_arr_mlx_mlx, save_arr_mlx))

                        # Load array saved by numpy as mlx array
                        load_arr_npy_mlx = mx.load(save_file_npy)
                        self.assertTrue(mx.array_equal(load_arr_npy_mlx, save_arr_mlx))

                        # Load array saved by mlx as numpy array
                        load_arr_mlx_npy = np.load(save_file_mlx)
                        self.assertTrue(np.array_equal(load_arr_mlx_npy, save_arr_npy))

    def test_save_and_load_fs(self):

        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        for dt in self.dtypes:
            with self.subTest(dtype=dt):
                for i, shape in enumerate([(1,), (23,), (1024, 1024), (4, 6, 3, 1, 2)]):
                    with self.subTest(shape=shape):
                        save_file_mlx = os.path.join(
                            self.test_dir, f"mlx_{dt}_{i}_fs.npy"
                        )
                        save_file_npy = os.path.join(
                            self.test_dir, f"npy_{dt}_{i}_fs.npy"
                        )

                        save_arr = np.random.uniform(0.0, 32.0, size=shape)
                        save_arr_npy = save_arr.astype(getattr(np, dt))
                        save_arr_mlx = mx.array(save_arr_npy)

                        with open(save_file_mlx, "wb") as f:
                            mx.save(f, save_arr_mlx)

                        np.save(save_file_npy, save_arr_npy)

                        # Load array saved by mlx as mlx array
                        with open(save_file_mlx, "rb") as f:
                            load_arr_mlx_mlx = mx.load(f)
                        self.assertTrue(mx.array_equal(load_arr_mlx_mlx, save_arr_mlx))

                        # Load array saved by numpy as mlx array
                        with open(save_file_npy, "rb") as f:
                            load_arr_npy_mlx = mx.load(f)
                        self.assertTrue(mx.array_equal(load_arr_npy_mlx, save_arr_mlx))

                        # Load array saved by mlx as numpy array
                        load_arr_mlx_npy = np.load(save_file_mlx)
                        self.assertTrue(np.array_equal(load_arr_mlx_npy, save_arr_npy))
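    # An .npz archive is a zip of .npy entries, so arrays written with
    # mx.savez / mx.savez_compressed round trip through np.load and vice
    # versa, which is what this test checks.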
    def test_savez_and_loadz(self):
        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        for dt in self.dtypes:
            with self.subTest(dtype=dt):
                shapes = [(6,), (6, 6), (4, 1, 3, 1, 2)]
                save_file_mlx_uncomp = os.path.join(
                    self.test_dir, f"mlx_{dt}_uncomp.npz"
                )
                save_file_npy_uncomp = os.path.join(
                    self.test_dir, f"npy_{dt}_uncomp.npz"
                )
                save_file_mlx_comp = os.path.join(self.test_dir, f"mlx_{dt}_comp.npz")
                save_file_npy_comp = os.path.join(self.test_dir, f"npy_{dt}_comp.npz")

                # Make dictionary of multiple arrays
                save_arrs_npy = {
                    f"save_arr_{i}": np.random.uniform(
                        0.0, 32.0, size=shapes[i]
                    ).astype(getattr(np, dt))
                    for i in range(len(shapes))
                }
                save_arrs_mlx = {k: mx.array(v) for k, v in save_arrs_npy.items()}

                # Save as npz files
                np.savez(save_file_npy_uncomp, **save_arrs_npy)
                mx.savez(save_file_mlx_uncomp, **save_arrs_mlx)
                np.savez_compressed(save_file_npy_comp, **save_arrs_npy)
                mx.savez_compressed(save_file_mlx_comp, **save_arrs_mlx)

                for save_file_npy, save_file_mlx in (
                    (save_file_npy_uncomp, save_file_mlx_uncomp),
                    (save_file_npy_comp, save_file_mlx_comp),
                ):
                    # Load arrays saved by mlx as mlx arrays
                    load_arr_mlx_mlx = mx.load(save_file_mlx)
                    for k, v in load_arr_mlx_mlx.items():
                        self.assertTrue(mx.array_equal(save_arrs_mlx[k], v))

                    # Load arrays saved by numpy as mlx arrays
                    load_arr_npy_mlx = mx.load(save_file_npy)
                    for k, v in load_arr_npy_mlx.items():
                        self.assertTrue(mx.array_equal(save_arrs_mlx[k], v))

                    # Load arrays saved by mlx as numpy arrays
                    load_arr_mlx_npy = np.load(save_file_mlx)
                    for k, v in load_arr_mlx_npy.items():
                        self.assertTrue(np.array_equal(save_arrs_npy[k], v))


if __name__ == "__main__":
    unittest.main()
231
python/tests/test_nn.py
Normal file
@ -0,0 +1,231 @@
import unittest

import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_flatten, tree_map, tree_unflatten
import numpy as np
import os
import tempfile

import mlx_tests


class TestNN(mlx_tests.MLXTestCase):
    def test_linear(self):
        inputs = mx.zeros((10, 4))
        layer = nn.Linear(input_dims=4, output_dims=8)
        outputs = layer(inputs)
        self.assertEqual(tuple(outputs.shape), (10, 8))

    def test_cross_entropy(self):
        logits = mx.array([[0.0, -float("inf")], [-float("inf"), 0.0]])
        targets = mx.array([0, 1])
        losses = nn.losses.cross_entropy(logits, targets)
        self.assertTrue(mx.array_equal(losses, mx.zeros((2,))))

    def test_gelu(self):
        inputs = [1.15286231, -0.81037411, 0.35816911, 0.77484438, 0.66276414]

        # From: jax.nn.gelu(np.array(inputs), approximate=False)
        expected = np.array(
            [1.0093501, -0.16925684, 0.22918941, 0.60498625, 0.49459383]
        )

        out = nn.GELU()(mx.array(inputs))
        self.assertTrue(np.allclose(out, expected))

        # Crudely check the approximations
        x = mx.arange(-6.0, 6.0, 12 / 100)
        y = nn.gelu(x)
        y_hat1 = nn.gelu_approx(x)
        y_hat2 = nn.gelu_fast_approx(x)
        self.assertLess(mx.abs(y - y_hat1).max(), 0.0003)
        self.assertLess(mx.abs(y - y_hat2).max(), 0.02)

    def test_group_norm(self):
        x = mx.arange(100, dtype=mx.float32)
        x = x.reshape(1, 10, 10, 1)
        x = mx.broadcast_to(x, (2, 10, 10, 4))
        x = mx.concatenate([x, 0.5 * x], axis=-1)

        # Group norm in groups last mode
        g = nn.GroupNorm(2, 8)
        y = g(x)
        means = y.reshape(2, -1, 2).mean(axis=1)
        var = y.reshape(2, -1, 2).var(axis=1)
        self.assertTrue(np.allclose(means, np.zeros_like(means), atol=1e-6))
        self.assertTrue(np.allclose(var, np.ones_like(var), atol=1e-6))
        g.weight = g.weight * 2
        g.bias = g.bias + 3
        y = g(x)
        means = y.reshape(2, -1, 2).mean(axis=1)
        var = y.reshape(2, -1, 2).var(axis=1)
        self.assertTrue(np.allclose(means, 3 * np.ones_like(means), atol=1e-6))
        self.assertTrue(np.allclose(var, 4 * np.ones_like(var), atol=1e-6))

        # Group norm in groups first mode
        g = nn.GroupNorm(2, 8, pytorch_compatible=True)
        y = g(x)
        means = y.reshape(2, -1, 2, 4).mean(axis=(1, -1))
        var = y.reshape(2, -1, 2, 4).var(axis=(1, -1))
        self.assertTrue(np.allclose(means, np.zeros_like(means), atol=1e-6))
        self.assertTrue(np.allclose(var, np.ones_like(var), atol=1e-6))
        g.weight = g.weight * 2
        g.bias = g.bias + 3
        y = g(x)
        means = y.reshape(2, -1, 2, 4).mean(axis=(1, -1))
        var = y.reshape(2, -1, 2, 4).var(axis=(1, -1))
        self.assertTrue(np.allclose(means, 3 * np.ones_like(means), atol=1e-6))
        self.assertTrue(np.allclose(var, 4 * np.ones_like(var), atol=1e-6))

    def test_conv1d(self):
        N = 5
        L = 12
        ks = 3
        C_in = 2
        C_out = 4
        x = mx.ones((N, L, C_in))
        c = nn.Conv1d(in_channels=C_in, out_channels=C_out, kernel_size=ks)
        c.weight = mx.ones_like(c.weight)
        y = c(x)
        self.assertEqual(y.shape, [N, L - ks + 1, C_out])
        self.assertTrue(mx.allclose(y, mx.full(y.shape, ks * C_in, mx.float32)))

        c = nn.Conv1d(in_channels=C_in, out_channels=C_out, kernel_size=ks, stride=2)
        y = c(x)
        self.assertEqual(y.shape, [N, (L - ks + 1) // 2, C_out])
        self.assertTrue("bias" in c.parameters())

        c = nn.Conv1d(in_channels=C_in, out_channels=C_out, kernel_size=ks, bias=False)
        self.assertTrue("bias" not in c.parameters())

    def test_conv2d(self):
        x = mx.ones((4, 8, 8, 3))
        c = nn.Conv2d(3, 1, 8)
        y = c(x)
        self.assertEqual(y.shape, [4, 1, 1, 1])
        c.weight = mx.ones_like(c.weight) / 8 / 8 / 3
        y = c(x)
        self.assertTrue(np.allclose(y[:, 0, 0, 0], x.mean(axis=(1, 2, 3))))

        # 3x3 conv no padding stride 1
        c = nn.Conv2d(3, 8, 3)
        y = c(x)
        self.assertEqual(y.shape, [4, 6, 6, 8])
        self.assertLess(mx.abs(y - c.weight.sum((1, 2, 3))).max(), 1e-4)

        # 3x3 conv padding 1 stride 1
        c = nn.Conv2d(3, 8, 3, padding=1)
        y = c(x)
        self.assertEqual(y.shape, [4, 8, 8, 8])
        self.assertLess(mx.abs(y[:, 1:7, 1:7] - c.weight.sum((1, 2, 3))).max(), 1e-4)
        self.assertLess(
            mx.abs(y[:, 0, 0] - c.weight[:, 1:, 1:].sum(axis=(1, 2, 3))).max(),
            1e-4,
        )
        self.assertLess(
            mx.abs(y[:, 7, 7] - c.weight[:, :-1, :-1].sum(axis=(1, 2, 3))).max(),
            1e-4,
        )
        self.assertLess(
            mx.abs(y[:, 1:7, 7] - c.weight[:, :, :-1].sum(axis=(1, 2, 3))).max(),
            1e-4,
        )
        self.assertLess(
            mx.abs(y[:, 7, 1:7] - c.weight[:, :-1, :].sum(axis=(1, 2, 3))).max(),
            1e-4,
        )

        # 3x3 conv no padding stride 2
        c = nn.Conv2d(3, 8, 3, padding=0, stride=2)
        y = c(x)
        self.assertEqual(y.shape, [4, 3, 3, 8])
        self.assertLess(mx.abs(y - c.weight.sum((1, 2, 3))).max(), 1e-4)

    def test_sequential(self):
        x = mx.ones((10, 2))
        m = nn.Sequential(nn.Linear(2, 10), nn.ReLU(), nn.Linear(10, 1))
        y = m(x)
        self.assertEqual(y.shape, [10, 1])
        params = m.parameters()
        self.assertTrue("layers" in params)
        self.assertEqual(len(params["layers"]), 3)
        self.assertTrue("weight" in params["layers"][0])
        self.assertEqual(len(params["layers"][1]), 0)
        self.assertTrue("weight" in params["layers"][2])

        m.layers[1] = nn.relu
        y2 = m(x)
        self.assertTrue(mx.array_equal(y, y2))

    def test_module_utilities(self):
        m = nn.Sequential(
            nn.Sequential(nn.Linear(2, 10), nn.relu),
            nn.Sequential(nn.Linear(10, 10), nn.ReLU()),
            nn.Linear(10, 1),
            mx.sigmoid,
        )

        children = m.children()
        self.assertTrue(isinstance(children, dict))
        self.assertEqual(len(children), 1)
        self.assertTrue(isinstance(children["layers"], list))
        self.assertEqual(len(children["layers"]), 4)
        self.assertEqual(children["layers"][3], {})
        flat_children = tree_flatten(children, is_leaf=nn.Module.is_module)
        self.assertEqual(len(flat_children), 3)

        leaves = tree_flatten(m.leaf_modules(), is_leaf=nn.Module.is_module)
        self.assertEqual(len(leaves), 4)
        self.assertEqual(leaves[0][0], "layers.0.layers.0")
        self.assertEqual(leaves[1][0], "layers.1.layers.0")
        self.assertEqual(leaves[2][0], "layers.1.layers.1")
        self.assertEqual(leaves[3][0], "layers.2")
        self.assertTrue(leaves[0][1] is m.layers[0].layers[0])
        self.assertTrue(leaves[1][1] is m.layers[1].layers[0])
        self.assertTrue(leaves[2][1] is m.layers[1].layers[1])
        self.assertTrue(leaves[3][1] is m.layers[2])

        m.eval()

        def assert_not_training(k, m):
            self.assertFalse(m.training)

        m.apply_to_modules(assert_not_training)

        m.train()

        def assert_training(k, m):
            self.assertTrue(m.training)

        m.apply_to_modules(assert_training)

    def test_sin_pe(self):
        m = nn.SinusoidalPositionalEncoding(16, min_freq=0.01)
        x = mx.arange(10)
        y = m(x)

        self.assertEqual(y.shape, [10, 16])
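        # The encoding is scaled so each position's embedding has unit norm,
        # hence the diagonal of the Gram matrix should be ~1.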
        similarities = y @ y.T
        self.assertLess(
            mx.abs(similarities[mx.arange(10), mx.arange(10)] - 1).max(), 1e-5
        )

    def test_io(self):
        def make_model():
            return nn.Sequential(nn.Linear(2, 2), nn.ReLU(), nn.Linear(2, 2))

        m = make_model()
        tdir = tempfile.TemporaryDirectory()
        file = os.path.join(tdir.name, "model.npz")
        m.save_weights(file)
        m_load = make_model()
        m_load.load_weights(file)
        tdir.cleanup()

        eq_tree = tree_map(mx.array_equal, m.parameters(), m_load.parameters())
        self.assertTrue(all(v.item() for _, v in tree_flatten(eq_tree)))


if __name__ == "__main__":
    unittest.main()
29
python/tests/test_optimizers.py
Normal file
@ -0,0 +1,29 @@
import unittest

import mlx.core as mx
import mlx.optimizers as opt
import mlx.utils

import mlx_tests


class TestOptimizers(mlx_tests.MLXTestCase):
    def test_optimizers(self):
        params = {
            "first": [mx.zeros((10,)), mx.zeros((1,))],
            "second": mx.zeros((1,)),
        }
        grads = mlx.utils.tree_map(lambda x: mx.ones_like(x), params)

        for optim in [opt.SGD(0.1), opt.Adam(0.1)]:
            update = optim.apply_gradients(grads, params)
            mx.eval(update)
            equal_shape = mlx.utils.tree_map(
                lambda x, y: x.shape == y.shape, params, update
            )
            all_equal = all(v for _, v in mlx.utils.tree_flatten(equal_shape))
            self.assertTrue(all_equal)


if __name__ == "__main__":
    unittest.main()
192
python/tests/test_random.py
Normal file
@ -0,0 +1,192 @@
import unittest

import mlx.core as mx

import mlx_tests


class TestRandom(mlx_tests.MLXTestCase):
    def test_global_rng(self):
        mx.random.seed(3)
        a = mx.random.uniform()
        b = mx.random.uniform()

        mx.random.seed(3)
        x = mx.random.uniform()
        y = mx.random.uniform()

        self.assertEqual(a.item(), x.item())
        self.assertEqual(y.item(), b.item())

    def test_key(self):
        k1 = mx.random.key(0)
        k2 = mx.random.key(0)
        self.assertTrue(mx.array_equal(k1, k2))

        k2 = mx.random.key(1)
        self.assertFalse(mx.array_equal(k1, k2))
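    # Key splitting is deterministic: splitting the same key again must
    # reproduce the same subkeys, and the two subkeys must differ.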
    def test_key_split(self):
        key = mx.random.key(0)

        k1, k2 = mx.random.split(key)
        self.assertFalse(mx.array_equal(k1, k2))

        r1, r2 = mx.random.split(key)
        self.assertTrue(mx.array_equal(k1, r1))
        self.assertTrue(mx.array_equal(k2, r2))

        keys = mx.random.split(key, 10)
        self.assertEqual(keys.shape, [10, 2])

    def test_uniform(self):
        key = mx.random.key(0)
        a = mx.random.uniform(key=key)
        self.assertEqual(a.shape, [])
        self.assertEqual(a.dtype, mx.float32)

        b = mx.random.uniform(key=key)
        self.assertEqual(a.item(), b.item())

        a = mx.random.uniform(shape=(2, 3))
        self.assertEqual(a.shape, [2, 3])

        a = mx.random.uniform(shape=(1000,), low=-1, high=5)
        self.assertTrue(mx.all(a > -1).item() and mx.all(a < 5).item())

        a = mx.random.uniform(shape=(1000,), low=mx.array(-1), high=5)
        self.assertTrue(mx.all(a > -1).item() and mx.all(a < 5).item())

    def test_normal(self):
        key = mx.random.key(0)
        a = mx.random.normal(key=key)
        self.assertEqual(a.shape, [])
        self.assertEqual(a.dtype, mx.float32)

        b = mx.random.normal(key=key)
        self.assertEqual(a.item(), b.item())

        a = mx.random.normal(shape=(2, 3))
        self.assertEqual(a.shape, [2, 3])

        # Generate in float16 or bfloat16
        for t in [mx.float16, mx.bfloat16]:
            a = mx.random.normal(dtype=t)
            self.assertEqual(a.dtype, t)

    def test_randint(self):
        a = mx.random.randint(0, 1, [])
        self.assertEqual(a.shape, [])
        self.assertEqual(a.dtype, mx.int32)

        shape = [88]
        low = mx.array(3)
        high = mx.array(15)

        key = mx.random.key(0)
        a = mx.random.randint(low, high, shape, key=key)
        self.assertEqual(a.shape, shape)
        self.assertEqual(a.dtype, mx.int32)

        # Check using the same key yields the same value
        b = mx.random.randint(low, high, shape, key=key)
        self.assertListEqual(a.tolist(), b.tolist())

        shape = [3, 4]
        low = mx.reshape(mx.array([0] * 3), [3, 1])
        high = mx.reshape(mx.array([12, 13, 14, 15]), [1, 4])

        a = mx.random.randint(low, high, shape)
        self.assertEqual(a.shape, shape)

        a = mx.random.randint(-10, 10, [1000, 1000])
        self.assertTrue(mx.all(-10 <= a).item() and mx.all(a < 10).item())

        a = mx.random.randint(10, -10, [1000, 1000])
        self.assertTrue(mx.all(a == 10).item())

    def test_bernoulli(self):
        a = mx.random.bernoulli()
        self.assertEqual(a.shape, [])
        self.assertEqual(a.dtype, mx.bool_)

        a = mx.random.bernoulli(mx.array(0.5), [5])
        self.assertEqual(a.shape, [5])

        a = mx.random.bernoulli(mx.array([2.0, -2.0]))
        self.assertEqual(a.tolist(), [True, False])
        self.assertEqual(a.shape, [2])

        p = mx.array([0.1, 0.2, 0.3])
        p = mx.reshape(p, [1, 3])
        x = mx.random.bernoulli(p, [4, 3])
        self.assertEqual(x.shape, [4, 3])

        with self.assertRaises(ValueError):
            mx.random.bernoulli(p, [2])  # Bad shape

        with self.assertRaises(ValueError):
            mx.random.bernoulli(0, [2])  # Bad type

    def test_truncated_normal(self):
        a = mx.random.truncated_normal(-2.0, 2.0)
        self.assertEqual(a.size, 1)
        self.assertEqual(a.dtype, mx.float32)

        a = mx.random.truncated_normal(mx.array([]), mx.array([]))
        self.assertEqual(a.dtype, mx.float32)
        self.assertEqual(a.size, 0)

        lower = mx.reshape(mx.array([-2.0, 0.0]), [1, 2])
        upper = mx.reshape(mx.array([0.0, 1.0, 2.0]), [3, 1])
        a = mx.random.truncated_normal(lower, upper)

        self.assertEqual(a.shape, [3, 2])
        self.assertTrue(mx.all(lower <= a).item() and mx.all(a <= upper).item())

        a = mx.random.truncated_normal(2.0, -2.0)
        self.assertTrue(mx.all(a == 2.0).item())

        a = mx.random.truncated_normal(-3.0, 3.0, [542, 399])
        self.assertEqual(a.shape, [542, 399])

        lower = mx.array([-2.0, -1.0])
        higher = mx.array([1.0, 2.0, 3.0])
        with self.assertRaises(ValueError):
            mx.random.truncated_normal(lower, higher)  # Bad shape

    def test_gumbel(self):
        samples = mx.random.gumbel(shape=(100, 100))
        self.assertEqual(samples.shape, [100, 100])
        self.assertEqual(samples.dtype, mx.float32)
        mean = 0.5772
        # Std deviation of the sample mean is small (<0.02),
        # so this test is pretty conservative
        self.assertTrue(mx.abs(mx.mean(samples) - mean) < 0.2)

    def test_categorical(self):
        logits = mx.zeros((10, 20))
        self.assertEqual(mx.random.categorical(logits, -1).shape, [10])
        self.assertEqual(mx.random.categorical(logits, 0).shape, [20])
        self.assertEqual(mx.random.categorical(logits, 1).shape, [10])

        out = mx.random.categorical(logits)
        self.assertEqual(out.shape, [10])
        self.assertEqual(out.dtype, mx.uint32)
        self.assertTrue(mx.max(out).item() < 20)

        out = mx.random.categorical(logits, 0, [5, 20])
        self.assertEqual(out.shape, [5, 20])
        self.assertTrue(mx.max(out).item() < 10)

        out = mx.random.categorical(logits, 1, num_samples=7)
        self.assertEqual(out.shape, [10, 7])
        out = mx.random.categorical(logits, 0, num_samples=7)
        self.assertEqual(out.shape, [20, 7])

        with self.assertRaises(ValueError):
            mx.random.categorical(logits, shape=[10, 5], num_samples=5)


if __name__ == "__main__":
    unittest.main()
26
python/tests/test_tree.py
Normal file
@ -0,0 +1,26 @@
import unittest

import mlx.core as mx
import mlx.utils

import mlx_tests


class TestTreeUtils(mlx_tests.MLXTestCase):
    def test_tree_map(self):
        tree = {"a": 0, "b": 1, "c": 2}
        tree = mlx.utils.tree_map(lambda x: x + 1, tree)

        expected_tree = {"a": 1, "b": 2, "c": 3}
        self.assertEqual(tree, expected_tree)

    def test_tree_flatten(self):
        tree = [{"a": 1, "b": 2}, "c"]
        vals = (1, 2, "c")
        flat_tree = mlx.utils.tree_flatten(tree)
        self.assertEqual(list(zip(*flat_tree))[1], vals)
        self.assertEqual(mlx.utils.tree_unflatten(flat_tree), tree)


if __name__ == "__main__":
    unittest.main()
167
python/tests/test_vmap.py
Normal file
@ -0,0 +1,167 @@
import unittest

import mlx.core as mx

import mlx_tests


class TestVmap(mlx_tests.MLXTestCase):
    def test_basics(self):
        # Can't vmap over scalars
        with self.assertRaises(ValueError):
            mx.vmap(mx.exp)(mx.array(1.0))

        # Invalid input
        with self.assertRaises(ValueError):
            mx.vmap(mx.exp)("hello")

        # Invalid axes
        with self.assertRaises(ValueError):
            mx.vmap(mx.exp, in_axes="hello")(mx.array([0, 1]))

        with self.assertRaises(ValueError):
            mx.vmap(mx.exp, in_axes=2)(mx.array([0, 1]))

        with self.assertRaises(ValueError):
            mx.vmap(mx.exp, out_axes="hello")(mx.array([0, 1]))

        with self.assertRaises(ValueError):
            mx.vmap(mx.exp, out_axes=2)(mx.array([0, 1]))

    def test_unary(self):
        ops = [
            "abs",
            "cos",
            "erf",
            "erfinv",
            "exp",
            "log",
            "log1p",
            "log2",
            "log10",
            "logical_not",
            "negative",
            "reciprocal",
            "rsqrt",
            "sigmoid",
            "sign",
            "sin",
            "sqrt",
            "square",
        ]
        for opname in ops:
            with self.subTest(op=opname):
                op = getattr(mx, opname)
                x = mx.arange(5)
                y = mx.vmap(op)(x)
                self.assertTrue(mx.array_equal(y, op(x), equal_nan=True))

                x = mx.arange(8).reshape(2, 4)
                y = mx.vmap(op)(x)
                self.assertTrue(mx.array_equal(y, op(x), equal_nan=True))

                y = mx.vmap(op, in_axes=1, out_axes=1)(x)
                self.assertTrue(mx.array_equal(y, op(x), equal_nan=True))

    def test_binary(self):
        ops = [
            "add",
            "divide",
            "equal",
            "greater",
            "greater_equal",
            "less",
            "less_equal",
            "logaddexp",
            "maximum",
            "minimum",
            "multiply",
            "power",
            "subtract",
        ]
        for opname in ops:
            with self.subTest(op=opname):
                op = getattr(mx, opname)
                x = mx.random.uniform(shape=(5,))
                y = mx.random.uniform(shape=(5,))
                out = mx.vmap(op)(x, y)
                self.assertTrue(mx.array_equal(out, op(x, y)))

                x = mx.random.uniform(shape=(2, 4))
                y = mx.random.uniform(shape=(2, 4))
                out = mx.vmap(op)(x, y)
                self.assertTrue(mx.array_equal(out, op(x, y)))

                out = mx.vmap(op, in_axes=(0, 0), out_axes=0)(x, y)
                self.assertTrue(mx.array_equal(out, op(x, y)))

                y = mx.random.uniform(shape=(4, 2))
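                # in_axes=(0, 1) maps axis 0 of x and axis 1 of y, so the
                # unvectorized reference is op(x, y.T).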
                out = mx.vmap(op, in_axes=(0, 1), out_axes=0)(x, y)
                self.assertTrue(mx.array_equal(out, op(x, y.T)))

                out = mx.vmap(op, in_axes=(0, 1), out_axes=1)(x, y)
                self.assertTrue(mx.array_equal(out, op(x, y.T).T))

    def test_tree(self):
        def my_fun(tree):
            return (tree["a"] + tree["b"][0]) * tree["b"][1]

        tree = {
            "a": mx.random.uniform(shape=(2, 4)),
            "b": (
                mx.random.uniform(shape=(2, 4)),
                mx.random.uniform(shape=(2, 4)),
            ),
        }
        out = mx.vmap(my_fun)(tree)
        expected = my_fun(tree)
        self.assertTrue(mx.array_equal(out, expected))

        with self.assertRaises(ValueError):
            mx.vmap(my_fun, in_axes={"a": 0, "b": 0}, out_axes=0)(tree)

        with self.assertRaises(ValueError):
            mx.vmap(my_fun, in_axes={"a": 0, "b": ((0, 0), 0)}, out_axes=0)(tree)

        out = mx.vmap(my_fun, in_axes=({"a": 0, "b": 0},), out_axes=0)(tree)
        self.assertTrue(mx.array_equal(out, my_fun(tree)))

        out = mx.vmap(my_fun, in_axes=({"a": 0, "b": (0, 0)},), out_axes=0)(tree)
        self.assertTrue(mx.array_equal(out, my_fun(tree)))

        tree = {
            "a": mx.random.uniform(shape=(2, 4)),
            "b": (
                mx.random.uniform(shape=(4, 2)),
                mx.random.uniform(shape=(4, 2)),
            ),
        }
        out = mx.vmap(my_fun, in_axes=({"a": 0, "b": (1, 1)},), out_axes=0)(tree)
        expected = (tree["a"] + tree["b"][0].T) * tree["b"][1].T
        self.assertTrue(mx.array_equal(out, expected))

        def my_fun(x, y):
            return {"a": x + y, "b": x * y}

        x = mx.random.uniform(shape=(2, 4))
        y = mx.random.uniform(shape=(2, 4))
        out = mx.vmap(my_fun, in_axes=0, out_axes=0)(x, y)
        expected = my_fun(x, y)
        self.assertTrue(mx.array_equal(out["a"], expected["a"]))
        self.assertTrue(mx.array_equal(out["b"], expected["b"]))

        with self.assertRaises(ValueError):
            mx.vmap(my_fun, in_axes=0, out_axes=(0, 1))(x, y)

        with self.assertRaises(ValueError):
            mx.vmap(my_fun, in_axes=0, out_axes={"a": 0, "c": 1})(x, y)

        out = mx.vmap(my_fun, in_axes=0, out_axes={"a": 1, "b": 0})(x, y)
        expected = my_fun(x, y)
        self.assertTrue(mx.array_equal(out["a"].T, expected["a"]))
        self.assertTrue(mx.array_equal(out["b"], expected["b"]))


if __name__ == "__main__":
    unittest.main()
127
setup.py
Normal file
@ -0,0 +1,127 @@
import os
import re
import subprocess
import sys
import sysconfig
from pathlib import Path

from setuptools import Extension, setup, find_namespace_packages
from setuptools.command.build_ext import build_ext


# A CMakeExtension needs a sourcedir instead of a file list.
# The name must be the _single_ output extension from the CMake build.
# If you need multiple extensions, see scikit-build.
class CMakeExtension(Extension):
    def __init__(self, name: str, sourcedir: str = "") -> None:
        super().__init__(name, sources=[])
        self.sourcedir = os.fspath(Path(sourcedir).resolve())


class CMakeBuild(build_ext):
    def build_extension(self, ext: CMakeExtension) -> None:
        # Must be in this form due to bug in .resolve() only fixed in Python 3.10+
        ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)  # type: ignore[no-untyped-call]
        extdir = ext_fullpath.parent.resolve()

        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"

        # CMake lets you override the generator - we need to check this.
        # Can be set with Conda-Build, for example.
        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
        # from Python.
        cmake_args = [
            f"-DCMAKE_INSTALL_PREFIX={extdir}{os.sep}",
            f"-DCMAKE_BUILD_TYPE={cfg}",
            "-DBUILD_SHARED_LIBS=ON",
            "-DMLX_BUILD_PYTHON_BINDINGS=ON",
            "-DMLX_BUILD_TESTS=OFF",
            "-DMLX_BUILD_BENCHMARKS=OFF",
            "-DMLX_BUILD_EXAMPLES=OFF",
            f"-DMLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY={extdir}{os.sep}",
        ]
        build_args = []
        # Adding CMake arguments set as environment variable
        # (needed e.g. to build for ARM OSx on conda-forge)
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        # Pass version to C++
        cmake_args += [f"-DMLX_VERSION={self.distribution.get_version()}"]  # type: ignore[attr-defined]

        if sys.platform.startswith("darwin"):
            # Cross-compile support for macOS - respect ARCHFLAGS if set
            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
            if archs:
                cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]

        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            # self.parallel is a Python 3 only way to set parallel jobs by hand
            # using -j in the build_ext call, not supported by pip or PyPA-build.
            if hasattr(self, "parallel") and self.parallel:
                # CMake 3.12+ only.
                build_args += [f"-j{self.parallel}"]

        build_temp = Path(self.build_temp) / ext.name
        if not build_temp.exists():
            build_temp.mkdir(parents=True)

        subprocess.run(
            ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True
        )
        subprocess.run(
            ["cmake", "--build", ".", "--target", "install", *build_args],
            cwd=build_temp,
            check=True,
        )

    # Make sure to copy mlx.metallib for inplace builds
    def run(self):
        super().run()

        # Based on https://github.com/pypa/setuptools/blob/main/setuptools/command/build_ext.py#L102
        if self.inplace:
            for ext in self.extensions:
                if ext.name == "mlx.core":
                    # Resolve inplace package dir
                    build_py = self.get_finalized_command("build_py")
                    inplace_file, regular_file = self._get_inplace_equivalent(
                        build_py, ext
                    )

                    inplace_dir = str(Path(inplace_file).parent.resolve())
                    regular_dir = str(Path(regular_file).parent.resolve())

                    self.copy_tree(regular_dir, inplace_dir)


# The information here can also be placed in setup.cfg - better separation of
# logic and declaration, and simpler if you include description/version in a file.
if __name__ == "__main__":
    packages = find_namespace_packages(
        where="python", exclude=["src", "tests", "tests.*"]
    )
    package_dir = {"": "python"}
    package_data = {"mlx": ["lib/*", "include/*", "share/*"]}
    setup(
        name="mlx",
        version="0.0.2",
        author="MLX Contributors",
        author_email="mlx@group.apple.com",
        description="A framework for machine learning on Apple Silicon.",
        long_description="",
        packages=packages,
        package_dir=package_dir,
        package_data=package_data,
        include_package_data=True,
        ext_modules=[CMakeExtension("mlx.core")],
        cmdclass={"build_ext": CMakeBuild},
        zip_safe=False,
        python_requires=">=3.7",
    )
41
tests/allocator_tests.cpp
Normal file
@ -0,0 +1,41 @@
#include <stdexcept>

#include "doctest/doctest.h"

#include "mlx/allocator.h"

using namespace mlx::core;

TEST_CASE("test simple allocations") {
  {
    auto buffer = allocator::malloc(sizeof(float));
    auto fptr = static_cast<float*>(buffer.raw_ptr());
    *fptr = 0.5f;
    CHECK_EQ(*fptr, 0.5f);
    allocator::free(buffer);
  }

  {
    auto buffer = allocator::malloc(128 * sizeof(int));
    int* ptr = static_cast<int*>(buffer.raw_ptr());
    for (int i = 0; i < 128; ++i) {
      ptr[i] = i;
    }
    allocator::free(buffer);
  }

  {
    auto buffer = allocator::malloc(0);
    allocator::free(buffer);
  }
}

TEST_CASE("test large allocations") {
  size_t size = 1 << 30;
  for (int i = 0; i < 100; ++i) {
    auto buffer = allocator::malloc(size);
    allocator::free(buffer);
  }
  // Shouldn't be able to allocate an exabyte anytime soon.
  CHECK_THROWS_AS(allocator::malloc(1ull << 60), std::runtime_error);
}
589
tests/array_tests.cpp
Normal file
@ -0,0 +1,589 @@
#include <climits>

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test array basics") {
  // Scalar
  array x(1.0);
  CHECK_EQ(x.size(), 1);
  CHECK_EQ(x.ndim(), 0);
  CHECK_EQ(x.shape(), std::vector<int>{});
  CHECK_THROWS_AS(x.shape(0), std::out_of_range);
  CHECK_THROWS_AS(x.shape(-1), std::out_of_range);
  CHECK_EQ(x.strides(), std::vector<size_t>{});
  CHECK_EQ(x.itemsize(), sizeof(float));
  CHECK_EQ(x.nbytes(), sizeof(float));
  CHECK_EQ(x.dtype(), float32);
  CHECK_EQ(x.item<float>(), 1.0);

  // Scalar with specified type
  x = array(1, float32);
  CHECK_EQ(x.dtype(), float32);
  CHECK_EQ(x.item<float>(), 1.0);

  // Scalar with specified type
  x = array(1, bool_);
  CHECK_EQ(x.dtype(), bool_);
  CHECK_EQ(x.itemsize(), sizeof(bool));
  CHECK_EQ(x.nbytes(), sizeof(bool));
  CHECK_EQ(x.item<bool>(), true);

  // Check shaped arrays
  x = array({1.0});
  CHECK_EQ(x.dtype(), float32);
  CHECK_EQ(x.size(), 1);
  CHECK_EQ(x.ndim(), 1);
  CHECK_EQ(x.shape(), std::vector<int>{1});
  CHECK_EQ(x.shape(0), 1);
  CHECK_EQ(x.shape(-1), 1);
  CHECK_THROWS_AS(x.shape(1), std::out_of_range);
  CHECK_THROWS_AS(x.shape(-2), std::out_of_range);
  CHECK_EQ(x.strides(), std::vector<size_t>{1});
  CHECK_EQ(x.item<float>(), 1.0);

  // Check empty array
  x = array({});
  CHECK_EQ(x.size(), 0);
  CHECK_EQ(x.dtype(), float32);
  CHECK_EQ(x.itemsize(), sizeof(float));
  CHECK_EQ(x.nbytes(), 0);
  CHECK_THROWS_AS(x.item<float>(), std::invalid_argument);

  x = array({1.0, 1.0});
  CHECK_EQ(x.size(), 2);
  CHECK_EQ(x.shape(), std::vector<int>{2});
  CHECK_EQ(x.itemsize(), sizeof(float));
  CHECK_EQ(x.nbytes(), x.itemsize() * x.size());

  // Accessing item in non-scalar array throws
  CHECK_THROWS_AS(x.item<float>(), std::invalid_argument);

  x = array({1.0, 1.0, 1.0}, {1, 3});
  CHECK(x.size() == 3);
  CHECK(x.shape() == std::vector<int>{1, 3});
  CHECK(x.strides() == std::vector<size_t>{3, 1});

  // Test wrong size/shapes throw:
  CHECK_THROWS_AS(array({1.0, 1.0, 1.0}, {4}), std::invalid_argument);
  CHECK_THROWS_AS(array({1.0, 1.0, 1.0}, {1, 4}), std::invalid_argument);
  CHECK_THROWS_AS(array({1.0, 1.0, 1.0}, {1, 2}), std::invalid_argument);

  // Test array ids work as expected
  x = array(1.0);
  auto y = x;
  CHECK_EQ(y.id(), x.id());
  array z(2.0);
  CHECK_NE(z.id(), x.id());
  z = x;
  CHECK_EQ(z.id(), x.id());

  // Array creation from pointer
  float data[] = {0.0, 1.0, 2.0, 3.0};
  x = array(data, {4});
  CHECK_EQ(x.dtype(), float32);
  CHECK(array_equal(x, array({0.0, 1.0, 2.0, 3.0})).item<bool>());

  // Array creation from vectors
  {
    std::vector<int> data = {0, 1, 2, 3};
    x = array(data.begin(), {4});
    CHECK_EQ(x.dtype(), int32);
    CHECK(array_equal(x, array({0, 1, 2, 3})).item<bool>());
  }

  {
    std::vector<bool> data = {false, true, false, true};
    x = array(data.begin(), {4});
    CHECK_EQ(x.dtype(), bool_);
    CHECK(array_equal(x, array({false, true, false, true})).item<bool>());
  }
}

TEST_CASE("test array types") {
#define basic_dtype_test(T, mlx_type) \
  T val = 42;                         \
  array x(val);                       \
  CHECK_EQ(x.dtype(), mlx_type);      \
  CHECK_EQ(x.item<T>(), val);         \
  x = array({val, val});              \
  CHECK_EQ(x.dtype(), mlx_type);

  // bool_
  {
    array x(true);
    CHECK_EQ(x.dtype(), bool_);
    CHECK_EQ(x.item<bool>(), true);

    x = array({true, false});
    CHECK_EQ(x.dtype(), bool_);

    x = array({true, false}, float32);
    CHECK_EQ(x.dtype(), float32);
    CHECK(array_equal(x, array({1.0f, 0.0f})).item<bool>());
  }

  // uint8
  { basic_dtype_test(uint8_t, uint8); }

  // uint16
  { basic_dtype_test(uint16_t, uint16); }

  // uint32
  { basic_dtype_test(uint32_t, uint32); }

  // uint64
  { basic_dtype_test(uint64_t, uint64); }

  // int8
  { basic_dtype_test(int8_t, int8); }

  // int16
  { basic_dtype_test(int16_t, int16); }

  // int32
  { basic_dtype_test(int32_t, int32); }

  // int64
  { basic_dtype_test(int64_t, int64); }

  // float16
  { basic_dtype_test(float16_t, float16); }

  // float32
  { basic_dtype_test(float, float32); }

  // bfloat16
  { basic_dtype_test(bfloat16_t, bfloat16); }

  // uint32
  {
    uint32_t val = UINT_MAX;
    array x(val);
    CHECK_EQ(x.dtype(), uint32);
    CHECK_EQ(x.item<uint32_t>(), val);

    x = array({1u, 2u});
    CHECK_EQ(x.dtype(), uint32);
  }

  // int32
  {
    array x(-1);
    CHECK_EQ(x.dtype(), int32);
    CHECK_EQ(x.item<int>(), -1);

    x = array({-1, 2});
    CHECK_EQ(x.dtype(), int32);

    std::vector<int> data{0, 1, 2};
    x = array(data.data(), {static_cast<int>(data.size())}, bool_);
    CHECK_EQ(x.dtype(), bool_);
    CHECK(array_equal(x, array({false, true, true})).item<bool>());
  }

  // int64
  {
    int64_t val = static_cast<int64_t>(INT_MIN) - 1;
    array x(val);
    CHECK_EQ(x.dtype(), int64);
    CHECK_EQ(x.item<int64_t>(), val);

    x = array({val, val});
    CHECK_EQ(x.dtype(), int64);
  }

  // float32
  {
    array x(3.14f);
    CHECK_EQ(x.dtype(), float32);
    CHECK_EQ(x.item<float>(), 3.14f);

    x = array(1.25);
    CHECK_EQ(x.dtype(), float32);
    CHECK_EQ(x.item<float>(), 1.25f);

    x = array({1.0f, 2.0f});
    CHECK_EQ(x.dtype(), float32);

    x = array({1.0, 2.0});
    CHECK_EQ(x.dtype(), float32);

    std::vector<double> data{1.0, 2.0, 4.0};
    x = array(data.data(), {static_cast<int>(data.size())});
    CHECK_EQ(x.dtype(), float32);
    CHECK(array_equal(x, array({1.0f, 2.0f, 4.0f})).item<bool>());
  }

  // complex64
  {
    complex64_t v = {1.0f, 1.0f};
    array x(v);
    CHECK_EQ(x.dtype(), complex64);
    CHECK_EQ(x.item<complex64_t>(), v);

    array y(std::complex<float>{1.0f, 1.0f});
    CHECK_EQ(y.dtype(), complex64);
    CHECK_EQ(y.item<complex64_t>(), v);
  }

#undef basic_dtype_test

#define basic_dtype_str_test(s, dtype)         \
  CHECK_EQ(s, dtype_to_array_protocol(dtype)); \
  CHECK_EQ(dtype_from_array_protocol(s), dtype);

  // To and from str
  {
    basic_dtype_str_test("|b1", bool_);
    basic_dtype_str_test("|u1", uint8);
    basic_dtype_str_test("<u2", uint16);
    basic_dtype_str_test("<u4", uint32);
    basic_dtype_str_test("<u8", uint64);
    basic_dtype_str_test("|i1", int8);
    basic_dtype_str_test("<i2", int16);
    basic_dtype_str_test("<i4", int32);
    basic_dtype_str_test("<i8", int64);
    basic_dtype_str_test("<f2", float16);
    basic_dtype_str_test("<f4", float32);
    basic_dtype_str_test("<V2", bfloat16);
    basic_dtype_str_test("<c8", complex64);
  }

#undef basic_dtype_str_test
}
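// The flags checked below describe memory layout: `contiguous` means the
// buffer is densely packed, while `row_contiguous` / `col_contiguous`
// additionally require C-order or Fortran-order strides, respectively.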
TEST_CASE("test array metadata") {
|
||||
array x(1.0f);
|
||||
CHECK_EQ(x.data_size(), 1);
|
||||
CHECK_EQ(x.flags().contiguous, true);
|
||||
CHECK_EQ(x.flags().row_contiguous, true);
|
||||
CHECK_EQ(x.flags().col_contiguous, true);
|
||||
|
||||
x = array({1.0f}, {1, 1, 1});
|
||||
CHECK_EQ(x.data_size(), 1);
|
||||
CHECK_EQ(x.flags().contiguous, true);
|
||||
CHECK_EQ(x.flags().row_contiguous, true);
|
||||
CHECK_EQ(x.flags().col_contiguous, true);
|
||||
|
||||
x = array({1.0f, 1.0f}, {1, 2});
|
||||
CHECK_EQ(x.data_size(), 2);
|
||||
CHECK_EQ(x.flags().contiguous, true);
|
||||
CHECK_EQ(x.flags().row_contiguous, true);
|
||||
CHECK_EQ(x.flags().col_contiguous, true);
|
||||
|
||||
x = zeros({1, 1, 4});
|
||||
eval(x);
|
||||
CHECK_EQ(x.data_size(), 4);
|
||||
CHECK_EQ(x.flags().contiguous, true);
|
||||
CHECK_EQ(x.flags().row_contiguous, true);
|
||||
CHECK_EQ(x.flags().col_contiguous, true);
|
||||
|
||||
x = zeros({2, 4});
|
||||
eval(x);
|
||||
CHECK_EQ(x.data_size(), 8);
|
||||
CHECK_EQ(x.flags().contiguous, true);
|
||||
CHECK_EQ(x.flags().row_contiguous, true);
|
||||
CHECK_EQ(x.flags().col_contiguous, false);
|
||||
|
||||
x = array(1.0f);
|
||||
auto y = broadcast_to(x, {1, 1, 1});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 1);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
y = broadcast_to(x, {2, 8, 10});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 1);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, false);
|
||||
CHECK_EQ(y.flags().col_contiguous, false);
|
||||
|
||||
y = broadcast_to(x, {1, 0});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 0);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
y = broadcast_to(zeros({4, 2, 1}), {4, 2, 0});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 0);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = array(1.0f);
|
||||
y = transpose(x);
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 1);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = ones({1, 1, 1});
|
||||
y = transpose(x);
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 1);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = ones({1, 1, 1});
|
||||
y = transpose(x, {0, 1, 2});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 1);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = ones({1, 1, 1});
|
||||
y = transpose(x, {1, 2, 0});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 1);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = ones({4, 1});
|
||||
y = transpose(x);
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 4);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = ones({2, 3, 4});
|
||||
y = transpose(x);
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 24);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, false);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
y = transpose(x, {0, 2, 1});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 24);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, false);
|
||||
CHECK_EQ(y.flags().col_contiguous, false);
|
||||
|
||||
y = transpose(transpose(x, {0, 2, 1}), {0, 2, 1});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 24);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, false);
|
||||
|
||||
x = array(1.0f);
|
||||
y = reshape(x, {1, 1, 1});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 1);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = ones({2, 4});
|
||||
y = reshape(x, {8});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 8);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
y = reshape(x, {8, 1, 1});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 8);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
y = reshape(x, {1, 8, 1});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 8);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = ones({12});
|
||||
y = reshape(x, {2, 3, 2});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 12);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, false);
|
||||
|
||||
x = array(1.0f);
|
||||
y = slice(x, {}, {});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 1);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = array({1.0f});
|
||||
y = slice(x, {-10}, {10}, {10});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 1);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);
|
||||
|
||||
x = array({1.0f, 2.0f, 3.0f}, {1, 3});
|
||||
y = slice(x, {0, 0}, {1, 3}, {1, 1});
|
||||
eval(y);
|
||||
CHECK_EQ(y.data_size(), 3);
|
||||
CHECK_EQ(y.flags().contiguous, true);
|
||||
CHECK_EQ(y.flags().row_contiguous, true);
|
||||
CHECK_EQ(y.flags().col_contiguous, true);

  x = array({1.0f, 2.0f, 3.0f}, {1, 3});
  y = slice(x, {0, 0}, {0, 3}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 0);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  x = array({1.0f, 2.0f, 3.0f}, {1, 3});
  y = slice(x, {0, 0}, {1, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 2);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  x = array({1.0f, 2.0f, 3.0f}, {1, 3});
  y = slice(x, {0, 0}, {1, 2}, {2, 3});
  eval(y);
  CHECK_EQ(y.shape(), std::vector<int>{1, 1});
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  x = array({0.0f, 1.0f, 2.0f, 3.0f}, {1, 4});
  y = slice(x, {0, 0}, {1, 4}, {1, 2});
  eval(y);
  CHECK_EQ(y.shape(), std::vector<int>{1, 2});
  CHECK_EQ(y.flags().contiguous, false);
  CHECK_EQ(y.flags().row_contiguous, false);
  CHECK_EQ(y.flags().col_contiguous, false);

  x = broadcast_to(array(1.0f), {4, 10});
  y = slice(x, {0, 0}, {4, 10}, {2, 2});
  eval(y);
  CHECK_EQ(y.shape(), std::vector<int>{2, 5});
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, false);
  CHECK_EQ(y.flags().col_contiguous, false);

  x = broadcast_to(array({1.0f, 2.0f}), {4, 2});
  y = slice(x, {0, 0}, {1, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 2);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  y = slice(x, {1, 0}, {2, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 2);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  x = array({0.0f, 1.0f, 2.0f, 3.0f}, {2, 2});
  y = slice(x, {0, 0}, {2, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 4);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, false);

  y = slice(transpose(x), {0, 0}, {2, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 4);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, false);
  CHECK_EQ(y.flags().col_contiguous, true);

  x = ones({2, 4});
  auto out = split(x, 2);
  eval(out);
  for (auto y : out) {
    CHECK_EQ(y.data_size(), 4);
    CHECK_EQ(y.flags().contiguous, true);
    CHECK_EQ(y.flags().row_contiguous, true);
    CHECK_EQ(y.flags().col_contiguous, true);
  }
  out = split(x, 4, 1);
  eval(out);
  for (auto y : out) {
    CHECK_EQ(y.flags().contiguous, false);
    CHECK_EQ(y.flags().row_contiguous, false);
    CHECK_EQ(y.flags().col_contiguous, false);
  }
}
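
The flags checked throughout this test can be recovered from shape and strides alone. Below is a hypothetical helper (not part of the library) sketching the invariants: row-contiguity means the strides match a C-ordered layout, column-contiguity a Fortran-ordered one, and size-1 dimensions place no constraint on their stride. Zero-size edge cases like the `{1, 0}` broadcast above need extra care and are ignored here.

```
#include <cstddef>
#include <vector>

// Hypothetical sketch of the row/col contiguity invariants; size-1 dims
// are unconstrained, size-0 dims are not handled.
bool is_row_contiguous(
    const std::vector<int>& shape,
    const std::vector<size_t>& strides) {
  size_t expected = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
    if (shape[i] != 1 && strides[i] != expected) {
      return false;
    }
    expected *= shape[i];
  }
  return true;
}

bool is_col_contiguous(
    const std::vector<int>& shape,
    const std::vector<size_t>& strides) {
  size_t expected = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] != 1 && strides[i] != expected) {
      return false;
    }
    expected *= shape[i];
  }
  return true;
}
```

For example, transposing a `{2, 3, 4}` array gives shape `{4, 3, 2}` with strides `{1, 4, 12}`: the strides fail the C-order check but satisfy the Fortran-order one, matching the `row_contiguous == false`, `col_contiguous == true` assertions above.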

TEST_CASE("test array iteration") {
  // Dim 0 arrays
  auto arr = array(1);
  CHECK_THROWS(arr.begin());

  // Iterated arrays are read only
  CHECK(std::is_const_v<decltype(*arr.begin())>);

  arr = array({1, 2, 3, 4, 5});
  int i = 0;
  for (auto a : arr) {
    i++;
    CHECK_EQ(a.item<int>(), i);
  }
  CHECK_EQ(i, 5);

  arr = array({1, 2, 3, 4}, {2, 2});
  CHECK(array_equal(*arr.begin(), array({1, 2})).item<bool>());
  CHECK(array_equal(*(arr.begin() + 1), array({3, 4})).item<bool>());
  CHECK_EQ(arr.begin() + 2, arr.end());
}

TEST_CASE("test array shared buffer") {
  std::vector<int> shape = {2, 2};
  int n_elem = shape[0] * shape[1];

  allocator::Buffer buf_b = allocator::malloc(n_elem * sizeof(float));
  void* buf_b_ptr = buf_b.raw_ptr();
  float* float_buf_b = (float*)buf_b_ptr;

  for (int i = 0; i < n_elem; i++) {
    float_buf_b[i] = 2.;
  }

  CHECK_EQ(float_buf_b[0], ((float*)buf_b_ptr)[0]);

  auto deleter = [float_buf_b](allocator::Buffer buf) {
    CHECK_EQ(float_buf_b, (float*)buf.raw_ptr());
    CHECK_EQ(float_buf_b[0], ((float*)buf.raw_ptr())[0]);
    allocator::free(buf);
  };

  array a = ones(shape, float32);
  array b = array(buf_b, shape, float32, deleter);

  eval(a + b);
}
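
The deleter is what makes handing a foreign buffer to `array` safe: it runs when the array releases its data. As a sketch (not something this test exercises), one way to let two arrays share a single external buffer is to route the actual free through a `std::shared_ptr`, so it happens exactly once when the last owner lets go:

```
#include <memory>

#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  auto buf = allocator::malloc(4 * sizeof(float));

  // The shared_ptr's deleter frees the buffer once, after the last
  // array holding a copy of `release` gives up its data.
  std::shared_ptr<void> release(
      buf.raw_ptr(), [buf](void*) { allocator::free(buf); });
  auto deleter = [release](allocator::Buffer) { /* freed via release */ };

  array a(buf, {4}, float32, deleter);
  array b(buf, {4}, float32, deleter);
  eval(a + b);
}
```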
1192
tests/autograd_tests.cpp
Normal file
File diff suppressed because it is too large

33
tests/device_tests.cpp
Normal file
@ -0,0 +1,33 @@
#include "doctest/doctest.h"

#include <cstdlib>

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test device placement") {
  auto device = default_device();
  Device d = metal::is_available() ? Device::gpu : Device::cpu;
  if (std::getenv("DEVICE") == nullptr) {
    CHECK_EQ(device, d);
  }

  array x(1.0f);
  array y(1.0f);
  auto z = add(x, y, default_device());
  if (metal::is_available()) {
    z = add(x, y, Device::gpu);
    z = add(x, y, Device(Device::gpu, 0));
  } else {
    CHECK_THROWS_AS(set_default_device(Device::gpu), std::invalid_argument);
    CHECK_THROWS_AS(add(x, y, Device::gpu), std::invalid_argument);
  }

  // Set the default device to the CPU
  set_default_device(Device::cpu);
  CHECK_EQ(default_device(), Device::cpu);

  // Revert
  set_default_device(device);
}
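
A pattern that follows from this test: guard GPU placement behind `metal::is_available()` rather than hard-coding `Device::gpu`, since requesting the GPU without Metal throws `std::invalid_argument`. A minimal sketch:

```
#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  // Fall back to the CPU when Metal is unavailable.
  Device d = metal::is_available() ? Device::gpu : Device::cpu;
  set_default_device(d);

  auto z = add(array(1.0f), array(2.0f), d);
  eval(z);
}
```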
97
tests/eval_tests.cpp
Normal file
@ -0,0 +1,97 @@
#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test eval") {
  {
    array x(1.0);
    array y(1);
    array z(true);
    eval({x, y, z});
    CHECK_EQ(x.item<float>(), 1.0);
  }

  {
    array x(1.0);
    array y = ones({2, 2});
    array z(true);
    eval({x, y, z});
    CHECK(array_equal(y, array({1.0, 1.0, 1.0, 1.0}, {2, 2})).item<bool>());
  }
}

TEST_CASE("test eval multiple") {
  auto x = ones({10, 10});
  auto y = ones({10, 10});
  eval({x, y});
  CHECK(array_equal(x, y).item<bool>());

  auto a = x + y;
  auto b = x - y;
  eval({a, b});
  CHECK(array_equal(a, full({10, 10}, 2.0f)).item<bool>());
  CHECK(array_equal(b, full({10, 10}, 0.0f)).item<bool>());

  x = ones({10, 10});
  y = ones({10, 10});
  eval(x, y);
  CHECK(array_equal(x, y).item<bool>());

  a = x + y;
  b = x - y;
  eval(a, b);
  CHECK(array_equal(a, full({10, 10}, 2.0f)).item<bool>());
  CHECK(array_equal(b, full({10, 10}, 0.0f)).item<bool>());
}

TEST_CASE("test eval with tracer") {
  auto x = array(1);
  x.set_tracer(true);

  // Ok, x is not a node
  eval(x);

  x = ones({2, 3});
  x.set_tracer(true);
  CHECK_THROWS(eval(x));

  // Ok with retain_graph=true
  eval({x}, true);

  // Make sure all arguments are checked
  auto y = ones({2, 3});
  CHECK_THROWS(eval(x, y));
}

TEST_CASE("test eval graph retention") {
  auto x = array(1);
  auto y = array(2);
  auto z = x + y;
  eval({z}, true);
  CHECK(z.has_primitive());
  CHECK(z.is_evaled());
  CHECK_EQ(z.item<int>(true), 3);
  CHECK(z.has_primitive());
  CHECK(z.is_evaled());

  CHECK_EQ(z.item<int>(), 3);
  CHECK(!z.has_primitive());
  CHECK(z.is_evaled());

  z = x + y;
  auto a = z + x;
  auto b = a + y;
  eval({b}, true);
  CHECK(z.has_primitive());
  CHECK(z.is_evaled());
  CHECK(a.has_primitive());
  CHECK(a.is_evaled());

  eval({b}, false);
  CHECK(!z.has_primitive());
  CHECK(z.is_evaled());
  CHECK(!a.has_primitive());
  CHECK(a.is_evaled());
}
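
What this test pins down: `eval({x}, true)` and `item<T>(true)` evaluate while retaining the graph (the output keeps its primitive and inputs), whereas the defaults release the graph once values are materialized. A short sketch of the distinction, mirroring the checks above:

```
#include <cassert>

#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  auto z = array(1) + array(2);

  // Evaluate but keep the graph: z still knows how it was computed.
  assert(z.item<int>(true) == 3);
  assert(z.has_primitive());

  // Default: the graph is released once the value is materialized.
  assert(z.item<int>() == 3);
  assert(!z.has_primitive());
}
```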
81
tests/load_tests.cpp
Normal file
@ -0,0 +1,81 @@
#include <filesystem>
#include <stdexcept>
#include <vector>

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

std::string get_temp_file(const std::string& name) {
  return std::filesystem::temp_directory_path().append(name);
}

TEST_CASE("test single array serialization") {
  // Basic test
  {
    auto a = random::uniform(-5.f, 5.f, {2, 5, 12}, float32);

    std::string file_path = get_temp_file("test_arr.npy");

    save(file_path, a);
    auto b = load(file_path);

    CHECK_EQ(a.dtype(), b.dtype());
    CHECK_EQ(a.shape(), b.shape());
    CHECK(array_equal(a, b).item<bool>());
  }

  // Other shapes
  {
    auto a = random::uniform(-5.f, 5.f, {1}, float32);

    std::string file_path = get_temp_file("test_arr_0.npy");

    save(file_path, a);
    auto b = load(file_path);

    CHECK_EQ(a.dtype(), b.dtype());
    CHECK_EQ(a.shape(), b.shape());
    CHECK(array_equal(a, b).item<bool>());
  }

  {
    auto a = random::uniform(-5.f, 5.f, {46}, float32);

    std::string file_path = get_temp_file("test_arr_1.npy");

    save(file_path, a);
    auto b = load(file_path);

    CHECK_EQ(a.dtype(), b.dtype());
    CHECK_EQ(a.shape(), b.shape());
    CHECK(array_equal(a, b).item<bool>());
  }

  {
    auto a = random::uniform(-5.f, 5.f, {5, 2, 1, 3, 4}, float32);

    std::string file_path = get_temp_file("test_arr_2.npy");

    save(file_path, a);
    auto b = load(file_path);

    CHECK_EQ(a.dtype(), b.dtype());
    CHECK_EQ(a.shape(), b.shape());
    CHECK(array_equal(a, b).item<bool>());
  }
}
119
tests/scheduler_tests.cpp
Normal file
@ -0,0 +1,119 @@
#include "doctest/doctest.h"

#include "mlx/mlx.h"
#include "mlx/scheduler.h"

using namespace mlx::core;

TEST_CASE("test stream management") {
  auto s1 = default_stream(default_device());
  CHECK_EQ(s1.device, default_device());

  auto s2 = new_stream(default_device());
  CHECK_EQ(s2.device, default_device());
  CHECK_NE(s1, s2);

  // Check that default streams have the correct devices
  if (metal::is_available()) {
    auto s_gpu = default_stream(Device::gpu);
    CHECK_EQ(s_gpu.device, Device::gpu);
  } else {
    CHECK_THROWS_AS(default_stream(Device::gpu), std::invalid_argument);
  }
  auto s_cpu = default_stream(Device::cpu);
  CHECK_EQ(s_cpu.device, Device::cpu);

  s_cpu = new_stream(Device::cpu);
  CHECK_EQ(s_cpu.device, Device::cpu);

  if (metal::is_available()) {
    auto s_gpu = new_stream(Device::gpu);
    CHECK_EQ(s_gpu.device, Device::gpu);
  } else {
    CHECK_THROWS_AS(new_stream(Device::gpu), std::invalid_argument);
  }
}

TEST_CASE("test asynchronous launch") {
  auto s1 = default_stream(default_device());
  auto s2 = new_stream(default_device());

  // Make sure streams execute asynchronously
  int x = 1;
  auto p1 = std::make_shared<std::promise<void>>();
  auto p2 = std::make_shared<std::promise<void>>();
  auto f1 = p1->get_future().share();
  auto f2 = p2->get_future().share();
  auto fn1 = [&x, p = std::move(p1)]() {
    x++;
    p->set_value();
  };
  auto fn2 = [&x, p = std::move(p2), f = std::move(f1)]() {
    f.wait();
    x *= 5;
    p->set_value();
  };

  // fn2 is launched first and waits on fn1, but since they are on
  // different streams there is no deadlock.
  scheduler::enqueue(s2, std::move(fn2));
  scheduler::enqueue(s1, std::move(fn1));

  f2.wait();

  CHECK_EQ(x, 10);
}

TEST_CASE("test stream placement") {
  auto s1 = default_stream(default_device());
  auto s2 = new_stream(default_device());

  {
    // Wait on stream 1
    auto p = std::make_shared<std::promise<void>>();
    auto f = p->get_future().share();
    scheduler::enqueue(s1, [f = std::move(f)]() { f.wait(); });

    // Do some work on stream 2
    auto x = zeros({100}, float32, s2);
    auto y = ones({100}, float32, s2);
    auto z = add(x, y, s2);
    eval(z);
    p->set_value();
  }

  {
    // Wait on stream 1
    auto p = std::make_shared<std::promise<void>>();
    auto f = p->get_future().share();
    scheduler::enqueue(s1, [f = std::move(f)]() { f.wait(); });

    // Do some work on stream 2
    auto fn = [&s2](array a) { return add(a, add(a, a, s2), s2); };
    auto x = zeros({100}, s2);

    // The whole vjp computation should happen on the second stream,
    // otherwise this will hang.
    auto [out, dout] = vjp(fn, x, ones({100}, s2));

    // The whole jvp computation should happen on the second stream.
    std::tie(out, dout) = jvp(fn, x, ones({100}, s2));
    eval(out, dout);

    p->set_value();
  }
}
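
Outside the scheduler internals, this placement mechanism is what the trailing `StreamOrDevice` argument on ops exposes: pass a stream and the op runs there. A minimal sketch:

```
#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  // Run a small computation entirely on a dedicated stream.
  auto s = new_stream(default_device());
  auto a = ones({512}, float32, s);
  auto b = full({512}, 2.0f, s);
  auto c = multiply(a, b, s);
  eval(c);
}
```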

TEST_CASE("test scheduler races") {
  auto x = zeros({1});
  auto y = zeros({100});
  eval(x, y);
  auto a = exp(x);
  eval(a);
  a = exp(x);
  for (int i = 0; i < 10000; ++i) {
    y = exp(y);
  }
  eval(a, y);
}
26
tests/utils_tests.cpp
Normal file
@ -0,0 +1,26 @@
#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test type promotion") {
  for (auto t : {bool_, uint32, int32, int64, float32}) {
    auto a = array(0, t);
    CHECK_EQ(result_type({a}), t);

    std::vector<array> arrs = {array(0, t), array(0, t)};
    CHECK_EQ(result_type(arrs), t);
  }

  {
    std::vector<array> arrs = {array(false), array(0, int32)};
    CHECK_EQ(result_type(arrs), int32);
  }

  {
    std::vector<array> arrs = {array(0, int32), array(false), array(0.0f)};
    CHECK_EQ(result_type(arrs), float32);
  }
}
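
In practice `result_type` pairs with `astype` to bring mixed operands to a common dtype before combining them. A minimal sketch, assuming `astype` with its usual `(array, Dtype)` signature:

```
#include <cassert>
#include <vector>

#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  std::vector<array> args = {array(1, int32), array(2.0f)};
  auto dt = result_type(args); // float32, by the promotion rules above

  // Cast both operands up to the promoted type before combining them.
  auto out = add(astype(args[0], dt), astype(args[1], dt));
  assert(out.dtype() == float32);
}
```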